Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 07:38:08 -05:00.

Compare commits: go/feature ... al/ci_fixe (78 commits)
| SHA1 |
|---|
| a01814e991 |
| b5d50cec5a |
| c619eb479e |
| 4e8bdc4380 |
| 1deaaf5249 |
| abd2fe1f4e |
| 47d671b043 |
| f700016776 |
| 6fabe6bab0 |
| 91171c738d |
| 7bf0dc157d |
| 0612ef5be5 |
| aee4c1ed18 |
| e2a3ef151a |
| 6f77bea5e0 |
| e4f72dab30 |
| 7ed3fded4a |
| 488c942a3a |
| c0d98394fa |
| 93ff6992e2 |
| 2a4026c761 |
| 39c424b14d |
| 46a7a3b43b |
| 38b5759e88 |
| d6f8e59394 |
| a95db07003 |
| 6544e6f6a3 |
| 1d549dfd8a |
| 019548daa5 |
| 26b666955a |
| ce9da12e65 |
| 32b45ac4bc |
| 26055b236e |
| ce9e355c15 |
| 85cc638c62 |
| d454b5386b |
| 426f3bd192 |
| 4c707e79d8 |
| e1afb8126d |
| 0d1ef0af7e |
| 15e3474cda |
| 10be6f9423 |
| c521c2ca2e |
| 39c46056f6 |
| aa2b27460c |
| c258d53625 |
| 8ddee20a57 |
| 1d786b7202 |
| 7267d60e01 |
| 0148a6ffc8 |
| 63571a07ae |
| 6e2908ad4e |
| d3d06c905f |
| 051f33f166 |
| 11a8f97a1c |
| 35a9c323a7 |
| 641f47b775 |
| 456d0ced1b |
| 358bcc9a22 |
| 27a4564d83 |
| 296e419f6c |
| e1a25a10ac |
| d9349b3357 |
| 68e4ac4896 |
| 3f318a2046 |
| d1380794ed |
| fe5641ef6d |
| 3397aa81d2 |
| 8f10f8f8db |
| 92be95c6b8 |
| 990c4d0380 |
| 1d5abfd5ea |
| dfd1beeb47 |
| 43a007a2fa |
| 54faf64ecd |
| 8fe7f9c3cb |
| 9ed65db03d |
| 9413d3e722 |
.github/workflows/aws_tfhe_fast_tests.yml (vendored, 2 changes)

```diff
@@ -56,7 +56,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
```
.github/workflows/aws_tfhe_integer_tests.yml (vendored, 52 changes)

```diff
@@ -19,21 +19,53 @@ on:
   # Allows you to run this workflow manually from the Actions tab as an alternative.
   workflow_dispatch:
   pull_request:
-    types: [ labeled ]
+    types: [labeled]
   push:
     branches:
       - main
   schedule:
     # Nightly tests @ 3AM after each work day
     - cron: "0 3 * * MON-FRI"

 jobs:
+  should-run:
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
+        steps.changed-files.outputs.integer_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+          persist-credentials: "false"
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            integer:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+
   setup-instance:
     name: Setup instance (unsigned-integer-tests)
-    if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
-      github.event_name == 'workflow_dispatch'
+    needs: should-run
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
     runs-on: ubuntu-latest
     outputs:
       runner-name: ${{ steps.start-instance.outputs.label }}
@@ -60,7 +92,7 @@ jobs:
       - name: Checkout tfhe-rs
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
         with:
-          persist-credentials: 'false'
+          persist-credentials: "false"

       - name: Set up home
         run: |
@@ -103,7 +135,7 @@ jobs:
   teardown-instance:
     name: Teardown instance (unsigned-integer-tests)
     if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, unsigned-integer-tests ]
+    needs: [setup-instance, unsigned-integer-tests]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
```
.github/workflows/aws_tfhe_signed_integer_tests.yml (vendored)

```diff
@@ -19,21 +19,53 @@ on:
   # Allows you to run this workflow manually from the Actions tab as an alternative.
   workflow_dispatch:
   pull_request:
-    types: [ labeled ]
+    types: [labeled]
   push:
     branches:
       - main
   schedule:
     # Nightly tests @ 3AM after each work day
     - cron: "0 3 * * MON-FRI"

 jobs:
+  should-run:
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
+        steps.changed-files.outputs.integer_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+          persist-credentials: "false"
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            integer:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+
   setup-instance:
-    name: Setup instance (signed-integer-tests)
-    if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
-      github.event_name == 'workflow_dispatch'
+    name: Setup instance (unsigned-integer-tests)
+    needs: should-run
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
     runs-on: ubuntu-latest
     outputs:
       runner-name: ${{ steps.start-instance.outputs.label }}
@@ -60,7 +92,7 @@ jobs:
       - name: Checkout tfhe-rs
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
         with:
-          persist-credentials: 'false'
+          persist-credentials: "false"

       - name: Set up home
         run: |
@@ -107,7 +139,7 @@ jobs:
   teardown-instance:
     name: Teardown instance (signed-integer-tests)
     if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, signed-integer-tests ]
+    needs: [setup-instance, signed-integer-tests]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
```
.github/workflows/aws_tfhe_tests.yml (vendored, 2 changes)

```diff
@@ -63,7 +63,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
```
.github/workflows/aws_tfhe_wasm_tests.yml (vendored, 2 changes)

```diff
@@ -71,6 +71,8 @@ jobs:
           make test_nodejs_wasm_api_in_docker

       - name: Run parallel wasm tests
+        # test timeouts are at 60 but if we want a log we need to give time to the step to log stuff
+        timeout-minutes: 65
         run: |
           make test_web_js_api_parallel_ci
```
```diff
@@ -98,7 +98,7 @@ jobs:
           --append-results

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_boolean
           path: ${{ env.RESULTS_FILENAME }}
@@ -113,16 +113,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

       - name: Slack Notification
         if: ${{ failure() }}
```
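The removed shell step computed an HMAC-SHA256 signature over the results file and sent it as the `X-Hub-Signature-256` header; `data_sender.py` now owns that flow. Below is a minimal Rust sketch of the signing step, assuming the `hmac`, `sha2`, and `hex` crates; the `sign_results` helper is illustrative, not part of tfhe-rs.

```rust
use hmac::{Hmac, Mac};
use sha2::Sha256;

type HmacSha256 = Hmac<Sha256>;

/// Hypothetical helper: build the `X-Hub-Signature-256` value the old
/// curl step sent, i.e. "sha256=" + hex(HMAC-SHA256(secret, body)).
fn sign_results(secret: &str, body: &[u8]) -> String {
    let mut mac = HmacSha256::new_from_slice(secret.as_bytes())
        .expect("HMAC accepts keys of any length");
    mac.update(body);
    format!("sha256={}", hex::encode(mac.finalize().into_bytes()))
}

fn main() {
    let body = br#"{"bench":"boolean","value":42}"#;
    let header = sign_results("job-secret", body);
    // The receiving end (Slab) recomputes the MAC over the body and compares.
    println!("X-Hub-Signature-256: {header}");
}
```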
```diff
@@ -86,7 +86,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_core_crypto
           path: ${{ env.RESULTS_FILENAME }}
@@ -101,16 +101,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

       - name: Slack Notification
         if: ${{ failure() }}
```
```diff
@@ -82,7 +82,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_integer_multi_bit_gpu_default
           path: ${{ env.RESULTS_FILENAME }}
@@ -90,16 +90,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

       - name: Slack Notification
         if: ${{ always() }}
@@ -164,7 +156,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_core_crypto
           path: ${{ env.RESULTS_FILENAME }}
```
```diff
@@ -128,7 +128,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_core_crypto
           path: ${{ env.RESULTS_FILENAME }}
@@ -143,16 +143,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

   slack-notify:
     name: Slack Notification
```
```diff
@@ -124,7 +124,7 @@ jobs:
           parse_integer_benches

       - name: Upload csv results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_csv_integer
           path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -144,7 +144,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_integer
           path: ${{ env.RESULTS_FILENAME }}
@@ -159,16 +159,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

   slack-notify:
     name: Slack Notification
```
```diff
@@ -144,7 +144,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
           path: ${{ env.RESULTS_FILENAME }}
@@ -152,16 +152,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

   slack-notify:
     name: Slack Notification
```
```diff
@@ -147,7 +147,7 @@ jobs:
           parse_integer_benches

       - name: Upload csv results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_csv_integer
           path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -167,7 +167,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_integer
           path: ${{ env.RESULTS_FILENAME }}
@@ -182,17 +182,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
-
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

   slack-notify:
     name: Slack Notification
```
```diff
@@ -164,7 +164,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_integer
           path: ${{ env.RESULTS_FILENAME }}
@@ -172,16 +172,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

   slack-notify:
     name: Slack Notification
```
```diff
@@ -39,7 +39,7 @@ jobs:
           profile: multi-h100

   cuda-integer-full-multi-gpu-benchmarks:
-    name: Execute multi GPU integer benchmarks for all operations flavor
+    name: Execute multi GPU integer benchmarks
     needs: setup-instance
     runs-on: ${{ needs.setup-instance.outputs.runner-name }}
     timeout-minutes: 1440 # 24 hours
@@ -48,8 +48,8 @@ jobs:
       fail-fast: false
       max-parallel: 1
       matrix:
-        command: [integer, integer_multi_bit]
-        op_flavor: [default, unchecked]
+        command: [integer_multi_bit]
+        op_flavor: [default]
         # explicit include-based build matrix, of known valid options
         include:
           - os: ubuntu-22.04
```
```diff
@@ -144,7 +144,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
           path: ${{ env.RESULTS_FILENAME }}
@@ -152,16 +152,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

   slack-notify:
     name: Slack Notification
```
```diff
@@ -139,7 +139,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
           path: ${{ env.RESULTS_FILENAME }}
@@ -147,16 +147,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

       - name: Slack Notification
         if: ${{ failure() }}
```
```diff
@@ -141,7 +141,7 @@ jobs:
           --append-results

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
           path: ${{ env.RESULTS_FILENAME }}
@@ -149,16 +149,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

       - name: Slack Notification
         if: ${{ failure() }}
```
```diff
@@ -139,7 +139,7 @@ jobs:
           --throughput

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
           path: ${{ env.RESULTS_FILENAME }}
@@ -147,16 +147,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

       - name: Slack Notification
         if: ${{ failure() }}
```
```diff
@@ -39,7 +39,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
@@ -103,6 +103,8 @@ jobs:
           toolchain: nightly

       - name: Run benchmarks
+        # test timeouts are at 60 but if we want a log we need to give time to the step to log stuff
+        timeout-minutes: 65
         run: |
           make install_node
           make bench_web_js_api_parallel_ci
@@ -130,7 +132,7 @@ jobs:
           --append-results

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_wasm
           path: ${{ env.RESULTS_FILENAME }}
@@ -145,16 +147,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

       - name: Slack Notification
         if: ${{ failure() }}
```
```diff
@@ -36,7 +36,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
@@ -79,7 +79,7 @@ jobs:
     if: needs.setup-instance.result != 'skipped'
     needs: setup-instance
     concurrency:
-      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
       cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
     runs-on: ${{ needs.setup-instance.outputs.runner-name }}
     steps:
@@ -138,7 +138,7 @@ jobs:
           --append-results

       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
         with:
           name: ${{ github.sha }}_integer_zk
           path: ${{ env.RESULTS_FILENAME }}
@@ -153,16 +153,8 @@ jobs:
       - name: Send data to Slab
         shell: bash
         run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-            -H "Content-Type: application/json" \
-            -H "X-Slab-Repository: ${{ github.repository }}" \
-            -H "X-Slab-Command: store_data_v2" \
-            -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-            -d @${{ env.RESULTS_FILENAME }} \
-            ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"

       - name: Slack Notification
         if: ${{ !success() && !cancelled() }}
```
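The concurrency change above stops pushes to `main` from cancelling each other: appending the commit SHA to the group name gives every main-branch run its own group, while PR runs keep sharing one group and `cancel-in-progress` still prunes superseded ones. A small Rust sketch of the expression's effect; `concurrency_group` is an illustrative name, and GitHub's `cond && a || b` idiom acts as a ternary here.

```rust
fn concurrency_group(workflow: &str, event: &str, git_ref: &str, sha: &str) -> String {
    // Mirrors `${{ github.ref == 'refs/heads/main' && github.sha || '' }}`.
    let suffix = if git_ref == "refs/heads/main" { sha } else { "" };
    format!("{workflow}_{event}_{git_ref}{suffix}")
}

fn main() {
    // Two pushes to main land in distinct groups: neither cancels the other.
    assert_ne!(
        concurrency_group("zk_bench", "push", "refs/heads/main", "abc123"),
        concurrency_group("zk_bench", "push", "refs/heads/main", "def456"),
    );
    // Two runs for the same PR ref share a group and can supersede each other.
    assert_eq!(
        concurrency_group("zk_bench", "pull_request", "refs/pull/7/merge", "abc123"),
        concurrency_group("zk_bench", "pull_request", "refs/pull/7/merge", "def456"),
    );
    println!("ok");
}
```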
.github/workflows/cargo_build.yml (vendored, 19 changes)

```diff
@@ -19,14 +19,21 @@ jobs:

     strategy:
       matrix:
-        os: [large_ubuntu_16, macos-latest-large, large_windows_16_latest]
+        # GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
+        # even with a few PRs
+        os: [large_ubuntu_16, macos-latest, windows-latest]
       fail-fast: false

     steps:
       - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: stable
+
       - name: Install and run newline linter checks
-        if: matrix.os == 'ubuntu-latest'
+        if: ${{ contains(matrix.os, 'ubuntu') }}
         run: |
           wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
           echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
@@ -36,27 +43,33 @@ jobs:
           make check_newline

       - name: Run pcc checks
+        if: ${{ contains(matrix.os, 'ubuntu') }}
         run: |
           make pcc

       - name: Build concrete-csprng
+        if: ${{ contains(matrix.os, 'ubuntu') }}
         run: |
           make build_concrete_csprng

       - name: Build Release core
+        if: ${{ contains(matrix.os, 'ubuntu') }}
         run: |
           make build_core AVX512_SUPPORT=ON
           make build_core_experimental AVX512_SUPPORT=ON

       - name: Build Release boolean
+        if: ${{ contains(matrix.os, 'ubuntu') }}
         run: |
           make build_boolean

       - name: Build Release shortint
+        if: ${{ contains(matrix.os, 'ubuntu') }}
         run: |
           make build_shortint

       - name: Build Release integer
+        if: ${{ contains(matrix.os, 'ubuntu') }}
         run: |
           make build_integer

@@ -65,10 +78,12 @@ jobs:
           make build_tfhe_full

       - name: Build Release c_api
+        if: ${{ contains(matrix.os, 'ubuntu') }}
         run: |
           make build_c_api

       - name: Build coverage tests
+        if: ${{ contains(matrix.os, 'ubuntu') }}
         run: |
           make build_tfhe_coverage
```
.github/workflows/code_coverage.yml (vendored, 2 changes)

```diff
@@ -57,7 +57,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           files_yaml: |
             tfhe:
```
.github/workflows/gpu_fast_h100_tests.yml (vendored, 5 changes)

```diff
@@ -34,7 +34,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
@@ -146,7 +146,8 @@ jobs:

       - name: Run core crypto and internal CUDA backend tests
         run: |
-          BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=FALSE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=FALSE make test_integer_compression_gpu
           BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

       - name: Run user docs tests
```
.github/workflows/gpu_fast_tests.yml (vendored, 3 changes)

```diff
@@ -33,7 +33,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
@@ -145,6 +145,7 @@ jobs:
       - name: Run core crypto and internal CUDA backend tests
         run: |
           make test_core_crypto_gpu
+          make test_integer_compression_gpu
           make test_cuda_backend

       - name: Run user docs tests
```
```diff
@@ -34,7 +34,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
@@ -144,6 +144,10 @@ jobs:
         if: ${{ !cancelled() }}
         run: nvidia-smi

+      - name: Run multi-bit CUDA integer compression tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
+
       # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
       - name: Run multi-bit CUDA integer tests
         run: |
```
```diff
@@ -34,7 +34,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
@@ -42,7 +42,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
@@ -34,7 +34,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
@@ -41,7 +41,7 @@ jobs:
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
         with:
           since_last_remote_commit: true
           files_yaml: |
```
.github/workflows/make_release.yml (vendored, 2 changes)

```diff
@@ -42,7 +42,7 @@ jobs:
       - name: Prepare package
         run: |
           cargo package -p tfhe
-      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
+      - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
         with:
           name: crate
           path: target/package/*.crate
```
```diff
@@ -1,4 +1,3 @@
-# Publish new release of tfhe-rs on various platform.
 name: Publish concrete-csprng release

 on:
@@ -37,6 +36,6 @@ jobs:
           SLACK_COLOR: ${{ job.status }}
           SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
           SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "concrete-csprng release failed: (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
           SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
           SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
```
.github/workflows/make_release_concrete_tfhe_versionable.yml (vendored, new file, 36 lines)

```yaml
name: Publish tfhe-versionable release

on:
  workflow_dispatch:

env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

jobs:
  publish_release:
    name: Publish tfhe-versionable Release
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Publish crate.io package
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
        run: |
          cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }}
          cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
```
Makefile (18 changes)

```diff
@@ -18,7 +18,7 @@ FAST_TESTS?=FALSE
 FAST_BENCH?=FALSE
 NIGHTLY_TESTS?=FALSE
 BENCH_OP_FLAVOR?=DEFAULT
-NODE_VERSION=22.4
+NODE_VERSION=22.6
 FORWARD_COMPAT?=OFF
 BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
 BACKWARD_COMPAT_DATA_BRANCH?=v0.1
@@ -284,6 +284,9 @@ clippy_c_api: install_rs_check_toolchain

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
 clippy_js_wasm_api: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
@@ -481,6 +484,13 @@ test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

+.PHONY: test_integer_compression_gpu
+test_integer_compression_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::test_gpu_ciphertext_compression
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
+
 .PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
 test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -883,6 +893,12 @@ bench_integer_gpu: install_rs_check_toolchain
 		--bench integer-bench \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

+.PHONY: bench_integer_compression_gpu
+bench_integer_compression_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+		--bench glwe_packing_compression-integer-bench \
+		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
+
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
```
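The new `test_integer_compression_gpu` target selects tests by handing a module path to the libtest harness: `cargo test -- <filter>` runs every test whose fully qualified name contains the filter string. A toy illustration of that matching follows; the module tree below is a stand-in, not the real tfhe layout.

```rust
// Run with `cargo test -- integer::gpu` and only the nested test is picked
// up; the filter is a substring match on the test's full path.
#[cfg(test)]
mod integer {
    mod gpu {
        mod ciphertext {
            mod compressed_ciphertext_list {
                mod tests {
                    #[test]
                    fn test_gpu_ciphertext_compression() {
                        // Stand-in body; the real test lives under
                        // tfhe/src/integer/gpu/ciphertext/.
                        assert_eq!(2 + 2, 4);
                    }
                }
            }
        }
    }
}
```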
```diff
@@ -159,7 +159,7 @@ To run this code, use the following command:
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performances possible.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
+*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*

 <p align="right">
 <a href="#about" > ↑ Back to top </a>
```
```diff
@@ -148,10 +148,9 @@ where

     /// Computes one turn of the stream, updating registers and outputting the new bit.
     pub fn next_bool(&mut self) -> T {
-        match &self.fhe_key {
-            Some(sk) => set_server_key(sk.clone()),
-            None => (),
-        };
+        if let Some(sk) = &self.fhe_key {
+            set_server_key(sk.clone());
+        }

         let [o, a, b, c] = self.get_output_and_values(0);

@@ -226,18 +225,12 @@ where
     /// Computes 64 turns of the stream, outputting the 64 bits all at once in a
     /// Vec (first value is oldest, last is newest)
     pub fn next_64(&mut self) -> Vec<T> {
-        match &self.fhe_key {
-            Some(sk) => {
-                rayon::broadcast(|_| set_server_key(sk.clone()));
-            }
-            None => (),
-        }
+        if let Some(sk) = &self.fhe_key {
+            rayon::broadcast(|_| set_server_key(sk.clone()));
+        }
         let mut values = self.get_64_output_and_values();
-        match &self.fhe_key {
-            Some(_) => {
-                rayon::broadcast(|_| unset_server_key());
-            }
-            None => (),
-        }
+        if self.fhe_key.is_some() {
+            rayon::broadcast(|_| unset_server_key());
+        }

         let mut ret = Vec::<T>::with_capacity(64);
```
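The refactor above replaces `match` statements whose `None` arm does nothing with `if let` (and, where the bound value is unused, with an `is_some()` test), which is the idiomatic spelling and what clippy's `single_match` lint suggests. A minimal standalone sketch of the same transformation, with a toy `set_server_key`:

```rust
fn set_server_key(key: String) {
    println!("server key set: {key}");
}

fn main() {
    let fhe_key: Option<String> = Some("sk".to_string());

    // Before: a match whose None arm is an empty unit.
    match &fhe_key {
        Some(sk) => set_server_key(sk.clone()),
        None => (),
    };

    // After: if let states "only the Some case matters" directly.
    if let Some(sk) = &fhe_key {
        set_server_key(sk.clone());
    }

    // When the payload itself is not needed, is_some() is enough.
    if fhe_key.is_some() {
        println!("key present");
    }
}
```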
```diff
@@ -237,18 +237,12 @@ where
     /// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
     /// Vec (first value is oldest, last is newest)
     pub fn next_64(&mut self) -> Vec<T> {
-        match &self.fhe_key {
-            Some(sk) => {
-                rayon::broadcast(|_| set_server_key(sk.clone()));
-            }
-            None => (),
-        }
+        if let Some(sk) = &self.fhe_key {
+            rayon::broadcast(|_| set_server_key(sk.clone()));
+        }
         let values = self.get_64_output_and_values();
-        match &self.fhe_key {
-            Some(_) => {
-                rayon::broadcast(|_| unset_server_key());
-            }
-            None => (),
-        }
+        if self.fhe_key.is_some() {
+            rayon::broadcast(|_| unset_server_key());
+        }

         let mut bytes = Vec::<T>::with_capacity(8);
```
```diff
@@ -1,3 +1,5 @@
+#![allow(clippy::too_long_first_doc_paragraph)]
+
 mod static_deque;

 mod kreyvium;
```
```diff
@@ -120,10 +120,9 @@ where

     /// Computes one turn of the stream, updating registers and outputting the new bit.
     pub fn next_bool(&mut self) -> T {
-        match &self.fhe_key {
-            Some(sk) => set_server_key(sk.clone()),
-            None => (),
-        };
+        if let Some(sk) = &self.fhe_key {
+            set_server_key(sk.clone());
+        }

         let [o, a, b, c] = self.get_output_and_values(0);
```
```diff
@@ -196,18 +195,12 @@ where
     /// Computes 64 turns of the stream, outputting the 64 bits all at once in a
     /// Vec (first value is oldest, last is newest)
     pub fn next_64(&mut self) -> Vec<T> {
-        match &self.fhe_key {
-            Some(sk) => {
-                rayon::broadcast(|_| set_server_key(sk.clone()));
-            }
-            None => (),
-        }
+        if let Some(sk) = &self.fhe_key {
+            rayon::broadcast(|_| set_server_key(sk.clone()));
+        }
         let mut values = self.get_64_output_and_values();
-        match &self.fhe_key {
-            Some(_) => {
-                rayon::broadcast(|_| unset_server_key());
-            }
-            None => (),
-        }
+        if self.fhe_key.is_some() {
+            rayon::broadcast(|_| unset_server_key());
+        }

         let mut ret = Vec::<T>::with_capacity(64);
```
```diff
@@ -187,18 +187,12 @@ where
     /// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
     /// Vec (first value is oldest, last is newest)
     pub fn next_64(&mut self) -> Vec<T> {
-        match &self.fhe_key {
-            Some(sk) => {
-                rayon::broadcast(|_| set_server_key(sk.clone()));
-            }
-            None => (),
-        }
+        if let Some(sk) = &self.fhe_key {
+            rayon::broadcast(|_| set_server_key(sk.clone()));
+        }
         let values = self.get_64_output_and_values();
-        match &self.fhe_key {
-            Some(_) => {
-                rayon::broadcast(|_| unset_server_key());
-            }
-            None => (),
-        }
+        if self.fhe_key.is_some() {
+            rayon::broadcast(|_| unset_server_key());
+        }

         let mut bytes = Vec::<T>::with_capacity(8);
```
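Both the old and new code bracket the parallel section in `rayon::broadcast` calls because TFHE-rs server keys are installed per thread: every worker in rayon's pool needs `set_server_key` before running FHE operations, and `unset_server_key` afterwards so later jobs start clean. A self-contained sketch of that pattern, assuming the `rayon` crate and using a toy thread-local in place of the real key store:

```rust
use rayon::prelude::*;
use std::cell::RefCell;

thread_local! {
    // Toy stand-in for TFHE-rs's per-thread server-key slot.
    static SERVER_KEY: RefCell<Option<String>> = RefCell::new(None);
}

fn set_server_key(key: String) {
    SERVER_KEY.with(|slot| *slot.borrow_mut() = Some(key));
}

fn unset_server_key() {
    SERVER_KEY.with(|slot| *slot.borrow_mut() = None);
}

fn main() {
    let sk = "server-key".to_string();

    // Install the key on every thread of the global pool before parallel work.
    rayon::broadcast(|_| set_server_key(sk.clone()));

    // Workers can now read their thread-local copy during the parallel section.
    let keys_seen: usize = (0..64)
        .into_par_iter()
        .map(|_| SERVER_KEY.with(|slot| slot.borrow().is_some()) as usize)
        .sum();
    println!("{keys_seen}/64 turns saw a key");

    // Clear the key on every thread afterwards so later jobs start clean.
    rayon::broadcast(|_| unset_server_key());
}
```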
```diff
@@ -67,9 +67,21 @@ endif()

 add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

+# Check if the DEBUG flag is defined
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+  # Debug mode
+  message("Compiling in Debug mode")
+  add_definitions(-DDEBUG)
+  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
+else()
+  # Release mode
+  message("Compiling in Release mode")
+  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
+endif()
+
 # in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
 set(CMAKE_CUDA_FLAGS
-    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
+    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
     -std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
     --use_fast_math -Xcompiler -fPIC")
```
backends/tfhe-cuda-backend/cuda/include/compression.h (new file, 159 lines)

```cpp
#ifndef CUDA_INTEGER_COMPRESSION_H
#define CUDA_INTEGER_COMPRESSION_H

#include "integer.h"

extern "C" {
void scratch_cuda_integer_compress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
    bool allocate_gpu_memory);

void scratch_cuda_integer_decompress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
    bool allocate_gpu_memory);

void cuda_integer_compress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
    int8_t *mem_ptr);

void cuda_integer_decompress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *glwe_in, void *indexes_array,
    uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);

void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
                                                       uint32_t *gpu_indexes,
                                                       uint32_t gpu_count,
                                                       int8_t **mem_ptr_void);

void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
                                                         uint32_t *gpu_indexes,
                                                         uint32_t gpu_count,
                                                         int8_t **mem_ptr_void);
}

template <typename Torus> struct int_compression {
  int_radix_params compression_params;
  uint32_t storage_log_modulus;
  uint32_t lwe_per_glwe;

  uint32_t body_count;

  // Compression
  int8_t *fp_ks_buffer;
  Torus *tmp_lwe;
  Torus *tmp_glwe_array_out;

  int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
                  uint32_t gpu_count, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  uint32_t storage_log_modulus, bool allocate_gpu_memory) {
    this->compression_params = compression_params;
    this->lwe_per_glwe = lwe_per_glwe;
    this->storage_log_modulus = storage_log_modulus;
    this->body_count = num_radix_blocks;

    if (allocate_gpu_memory) {
      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                    compression_params.polynomial_size;

      tmp_lwe = (Torus *)cuda_malloc_async(
          num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
      tmp_glwe_array_out = (Torus *)cuda_malloc_async(
          glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);

      scratch_packing_keyswitch_lwe_list_to_glwe_64(
          streams[0], gpu_indexes[0], &fp_ks_buffer,
          compression_params.glwe_dimension, compression_params.polynomial_size,
          num_radix_blocks, true);
    }
  }
  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
    cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0],
                                               &fp_ks_buffer);
  }
};

template <typename Torus> struct int_decompression {
  int_radix_params encryption_params;
  int_radix_params compression_params;

  uint32_t storage_log_modulus;

  uint32_t num_lwes;
  uint32_t body_count;

  Torus *tmp_extracted_glwe;
  Torus *tmp_extracted_lwe;

  int_radix_lut<Torus> *carry_extract_lut;

  int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
                    uint32_t gpu_count, int_radix_params encryption_params,
                    int_radix_params compression_params,
                    uint32_t num_radix_blocks, uint32_t body_count,
                    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
    this->encryption_params = encryption_params;
    this->compression_params = compression_params;
    this->storage_log_modulus = storage_log_modulus;
    this->num_lwes = num_radix_blocks;
    this->body_count = body_count;

    if (allocate_gpu_memory) {
      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                    compression_params.polynomial_size;

      carry_extract_lut = new int_radix_lut<Torus>(
          streams, gpu_indexes, gpu_count, encryption_params, 1,
          num_radix_blocks, allocate_gpu_memory);

      tmp_extracted_glwe = (Torus *)cuda_malloc_async(
          glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
      tmp_extracted_lwe = (Torus *)cuda_malloc_async(
          num_radix_blocks *
              (compression_params.glwe_dimension *
                   compression_params.polynomial_size +
               1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
      // Decompression
      // Carry extract LUT
      auto carry_extract_f = [encryption_params](Torus x) -> Torus {
        return x / encryption_params.message_modulus;
      };

      generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0],
          carry_extract_lut->get_lut(gpu_indexes[0], 0),
          encryption_params.glwe_dimension, encryption_params.polynomial_size,
          encryption_params.message_modulus, encryption_params.carry_modulus,
          carry_extract_f);

      carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
    }
  }
  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);

    carry_extract_lut->release(streams, gpu_indexes, gpu_count);
    delete (carry_extract_lut);
  }
};
#endif
```
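In `int_decompression` above, the bootstrap that follows decompression applies `carry_extract_f`, which treats each block's cleartext as a base-`message_modulus` pair (carry, message) and recovers the carry by integer division. A plain-Rust check of that arithmetic; the concrete values are illustrative:

```rust
fn main() {
    // Shortint-style parameters: 2-bit message space.
    let message_modulus: u64 = 4;

    // A block's cleartext packs carry * message_modulus + message.
    let carry = 2u64;
    let message = 3u64;
    let packed = carry * message_modulus + message;

    // The LUT applied during decompression: x -> x / message_modulus.
    let carry_extract = |x: u64| x / message_modulus;

    assert_eq!(carry_extract(packed), carry);
    // The complementary message extraction would be x % message_modulus.
    assert_eq!(packed % message_modulus, message);
    println!("carry = {}, message = {}", carry_extract(packed), packed % message_modulus);
}
```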
```diff
@@ -39,10 +39,6 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

-bool cuda_check_support_cooperative_groups();
-
-bool cuda_check_support_thread_block_clusters();
-
 void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);

@@ -62,9 +58,13 @@ void cuda_synchronize_device(uint32_t gpu_index);
 void cuda_drop(void *ptr, uint32_t gpu_index);

 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
-}

 int cuda_get_max_shared_memory(uint32_t gpu_index);
+}
+
+bool cuda_check_support_cooperative_groups();
+
+bool cuda_check_support_thread_block_clusters();

 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
```
```diff
@@ -8,7 +8,7 @@ extern std::mutex m;
 extern bool p2p_enabled;

 extern "C" {
-int cuda_setup_multi_gpu();
+int32_t cuda_setup_multi_gpu();
 }

 // Define a variant type that can be either a vector or a single pointer
```
```diff
@@ -1,6 +1,7 @@
 #ifndef CUDA_INTEGER_H
 #define CUDA_INTEGER_H

 #include "keyswitch.h"
+#include "pbs/programmable_bootstrap.cuh"
 #include "programmable_bootstrap.h"
 #include "programmable_bootstrap_multibit.h"
@@ -15,7 +16,6 @@ enum SHIFT_OR_ROTATE_TYPE {
   LEFT_ROTATE = 2,
   RIGHT_ROTATE = 3
 };
-enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 };
 enum BITOP_TYPE {
   BITAND = 0,
   BITOR = 1,
```
```diff
@@ -112,10 +112,11 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
 void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
                                uint32_t gpu_count, int8_t **mem_ptr_void);

-void cuda_negate_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus);
+void cuda_negate_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);

 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
```
@@ -385,8 +386,8 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
|
||||
void cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
|
||||
void **bsks, uint32_t num_blocks, uint32_t shift);
|
||||
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
|
||||
void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift);
|
||||
|
||||
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
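The entry point above implements a Hillis-Steele inclusive scan over radix blocks. For orientation, a minimal host-side C++ analogue of the doubling-stride pattern (illustrative only, not part of the diff):

#include <cassert>
#include <vector>

int main() {
  std::vector<int> a = {3, 1, 7, 0, 4, 1, 6, 3};
  for (size_t stride = 1; stride < a.size(); stride *= 2) {
    std::vector<int> b = a;              // double buffering, as on the GPU
    for (size_t i = stride; i < a.size(); i++)
      b[i] = a[i] + a[i - stride];       // each pass doubles the reach
    a.swap(b);
  }
  assert(a.back() == 25);                // 3+1+7+0+4+1+6+3
  return 0;
}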
@@ -475,7 +476,8 @@ struct int_radix_params {
         message_modulus(message_modulus), carry_modulus(carry_modulus){};
 
   void print() {
-    printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, "
+    printf("pbs_type: %u, glwe_dimension: %u, "
+           "polynomial_size: %u, "
            "big_lwe_dimension: %u, "
            "small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
            "%u, pbs_base_log: "
@@ -812,7 +814,6 @@ template <typename Torus> struct int_radix_lut {
     }
   }
 };
-
 template <typename Torus> struct int_bit_extract_luts_buffer {
   int_radix_params params;
   int_radix_lut<Torus> *lut;
@@ -1356,6 +1357,7 @@ template <typename Torus> struct int_overflowing_sub_memory {
 
 template <typename Torus> struct int_sum_ciphertexts_vec_memory {
   Torus *new_blocks;
+  Torus *new_blocks_copy;
   Torus *old_blocks;
   Torus *small_lwe_vector;
   int_radix_params params;
@@ -1383,6 +1385,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
     new_blocks = (Torus *)cuda_malloc_async(
         max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
        streams[0], gpu_indexes[0]);
+    new_blocks_copy = (Torus *)cuda_malloc_async(
+        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
+        streams[0], gpu_indexes[0]);
     old_blocks = (Torus *)cuda_malloc_async(
         max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
         streams[0], gpu_indexes[0]);
@@ -1414,6 +1419,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
     this->new_blocks = new_blocks;
     this->old_blocks = old_blocks;
     this->small_lwe_vector = small_lwe_vector;
+    new_blocks_copy = (Torus *)cuda_malloc_async(
+        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
+        streams[0], gpu_indexes[0]);
 
     d_smart_copy_in = (int32_t *)cuda_malloc_async(
         max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
@@ -1432,8 +1440,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
       cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
     }
 
+    cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]);
     scp_mem->release(streams, gpu_indexes, gpu_count);
-
     delete scp_mem;
   }
 };
@@ -16,6 +16,21 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples);
 
+void scratch_packing_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
+    bool allocate_gpu_memory);
+
+void cuda_packing_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
+    void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
+
+void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
+                                                uint32_t gpu_index,
+                                                int8_t **fp_ks_buffer);
 }
 
 #endif // CNCRT_KS_H_
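The three new declarations form the usual scratch/run/cleanup triple. A minimal host-side sketch of how they chain together (parameter values below are illustrative assumptions, not values taken from tfhe-rs):

#include <cstdint>
#include "keyswitch.h"  // the declarations shown above

void packing_keyswitch_roundtrip(void *stream, uint32_t gpu_index,
                                 void *glwe_out, void *lwe_in, void *fp_ksk) {
  // Illustrative parameters only.
  uint32_t glwe_dimension = 1, polynomial_size = 2048;
  uint32_t input_lwe_dimension = 842, num_lwes = 64;
  uint32_t base_log = 3, level_count = 5;

  int8_t *fp_ks_buffer = nullptr;
  // 1) allocate device scratch space
  scratch_packing_keyswitch_lwe_list_to_glwe_64(
      stream, gpu_index, &fp_ks_buffer, glwe_dimension, polynomial_size,
      num_lwes, /*allocate_gpu_memory=*/true);
  // 2) keyswitch the LWE list and pack it into a single GLWE
  cuda_packing_keyswitch_lwe_list_to_glwe_64(
      stream, gpu_index, glwe_out, lwe_in, fp_ksk, fp_ks_buffer,
      input_lwe_dimension, glwe_dimension, polynomial_size, base_log,
      level_count, num_lwes);
  // 3) release the scratch space
  cleanup_packing_keyswitch_lwe_list_to_glwe(stream, gpu_index, &fp_ks_buffer);
}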
@@ -81,14 +81,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
 
 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                          int8_t **pbs_buffer);
 
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count);
-
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count);
-
 }
 
 template <typename Torus>
@@ -1,17 +1,3 @@
-set(SOURCES
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -38,8 +38,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
   auto lwe_out = lwe_array_out + input_id * lwe_output_size;
 
   // We assume each GLWE will store the first polynomial_size inputs
-  uint32_t nth_per_glwe = params::degree;
-  auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
+  uint32_t lwe_per_glwe = params::degree;
+  auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;
 
   auto nth = nth_array[input_id];
 
@@ -50,11 +50,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
 template <typename Torus, class params>
 __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
                                   Torus *lwe_array_out, Torus *glwe_array_in,
-                                  uint32_t *nth_array, uint32_t num_glwes,
+                                  uint32_t *nth_array, uint32_t num_nths,
                                   uint32_t glwe_dimension) {
   cudaSetDevice(gpu_index);
 
-  dim3 grid(num_glwes);
+  dim3 grid(num_nths);
   dim3 thds(params::degree / params::opt);
   sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
       lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
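The rename from num_glwes to num_nths reflects that the grid is now sized by the number of extracted samples, one block each, and a GLWE boundary is crossed every params::degree extractions. A small host-side check of the same indexing arithmetic (illustrative, not part of the diff):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t degree = 2048;          // stands in for params::degree
  const uint32_t glwe_dimension = 1;
  const uint32_t glwe_input_size = (glwe_dimension + 1) * degree;
  const uint32_t lwe_output_size = glwe_dimension * degree + 1;

  const uint32_t lwe_per_glwe = degree;  // each GLWE yields `degree` samples
  uint32_t input_id = 2050;              // the 2051st requested extraction

  // same arithmetic as the kernel: which GLWE, and which output slot
  uint32_t glwe_offset = (input_id / lwe_per_glwe) * glwe_input_size;
  uint32_t lwe_offset = input_id * lwe_output_size;

  assert(glwe_offset == 1 * glwe_input_size);  // falls in the second GLWE
  assert(lwe_offset == 2050 * lwe_output_size);
  return 0;
}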
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector<uint32_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint32_t *>(lwe_array_out),
       static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
       static_cast<uint64_t *>(lwe_output_indexes),
@@ -48,3 +48,35 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
       static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
       lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
 }
+
+void scratch_packing_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
+    bool allocate_gpu_memory) {
+  scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer,
+      glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
+}
+/* Perform functional packing keyswitch on a batch of 64 bits input LWE
+ * ciphertexts.
+ */
+void cuda_packing_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
+    void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
+
+  host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(fp_ksk_array), fp_ks_buffer, input_lwe_dimension,
+      output_glwe_dimension, output_polynomial_size, base_log, level_count,
+      num_lwes);
+}
+
+void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
+                                                uint32_t gpu_index,
+                                                int8_t **fp_ks_buffer) {
+  cuda_drop_async(*fp_ks_buffer, static_cast<cudaStream_t>(stream), gpu_index);
+}
@@ -7,6 +7,7 @@
 #include "polynomial/functions.cuh"
+#include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>
@@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
 }
 
 template <typename Torus>
-__host__ void cuda_keyswitch_lwe_ciphertext_vector(
+__host__ void host_keyswitch_lwe_ciphertext_vector(
     cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
     Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
@@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
         GET_VARIANT_ELEMENT(lwe_input_indexes, i);
 
     // Compute Keyswitch
-    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
+    host_keyswitch_lwe_ciphertext_vector<Torus>(
         streams[i], gpu_indexes[i], current_lwe_array_out,
         current_lwe_output_indexes, current_lwe_array_in,
         current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
@@ -154,4 +155,154 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
   }
 }
+
+template <typename Torus>
+__host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
+    cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
+    bool allocate_gpu_memory) {
+  cudaSetDevice(gpu_index);
+
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  if (allocate_gpu_memory)
+    *fp_ks_buffer = (int8_t *)cuda_malloc_async(
+        2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream,
+        gpu_index);
+}
+
+// public functional packing keyswitch for a single LWE ciphertext
+//
+// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
+// different thread blocks at the x-axis to work on that input.
+template <typename Torus>
+__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+    Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count) {
+
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  size_t glwe_size = (glwe_dimension + 1);
+
+  if (tid < glwe_size * polynomial_size) {
+    const int local_index = threadIdx.x;
+    // the output_glwe is split in polynomials and each x-block takes one of
+    // them
+    size_t poly_id = blockIdx.x;
+    size_t coef_per_block = blockDim.x;
+
+    // number of coefficients inside fp-ksk block for each lwe_input coefficient
+    size_t ksk_block_size = glwe_size * polynomial_size * level_count;
+
+    // initialize accumulator to 0
+    glwe_out[tid] = SEL(0, lwe_in[lwe_dimension_in],
+                        tid == glwe_dimension * polynomial_size);
+
+    // Iterate through all lwe elements
+    for (int i = 0; i < lwe_dimension_in; i++) {
+      // Round and prepare decomposition
+      Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
+
+      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
+      Torus mod_b_mask = (1ll << base_log) - 1ll;
+
+      // block of key for current lwe coefficient (cur_input_lwe[i])
+      auto ksk_block = &fp_ksk[i * ksk_block_size];
+      for (int j = 0; j < level_count; j++) {
+        auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
+        // Iterate through each level and multiply by the ksk piece
+        auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block];
+        Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
+        glwe_out[tid] -= decomposed * ksk_glwe_chunk[local_index];
+      }
+    }
+  }
+}
+// public functional packing keyswitch for a batch of LWE ciphertexts
+//
+// Selects the input each thread is working on using the y-block index.
+//
+// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
+// different thread blocks at the x-axis to work on that input.
+template <typename Torus>
+__global__ void
+packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
+                                   Torus *fp_ksk, uint32_t lwe_dimension_in,
+                                   uint32_t glwe_dimension,
+                                   uint32_t polynomial_size, uint32_t base_log,
+                                   uint32_t level_count, Torus *d_mem) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+  const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+  const int lwe_size = (lwe_dimension_in + 1);
+
+  const int input_id = blockIdx.y;
+  const int degree = input_id;
+
+  // Select an input
+  auto lwe_in = lwe_array_in + input_id * lwe_size;
+  auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
+  auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
+  // KS LWE to GLWE
+  packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+      ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
+      polynomial_size, base_log, level_count);
+
+  // P * x ^degree
+  auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size;
+  auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size;
+  polynomial_accumulate_monic_monomial_mul(out_result, in_poly, degree,
+                                           tid % polynomial_size,
+                                           polynomial_size, 1, true);
+}
+/// To-do: Rewrite this kernel for efficiency
+template <typename Torus>
+__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
+                                 uint32_t glwe_dimension,
+                                 uint32_t polynomial_size, uint32_t num_lwes) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < (glwe_dimension + 1) * polynomial_size) {
+    glwe_out[tid] = glwe_array_in[tid];
+
+    // Accumulate
+    for (int i = 1; i < num_lwes; i++) {
+      auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size;
+      glwe_out[tid] += glwe_in[tid];
+    }
+  }
+}
+template <typename Torus>
+__host__ void host_packing_keyswitch_lwe_list_to_glwe(
+    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
+    Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
+    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_lwes) {
+  cudaSetDevice(gpu_index);
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads);
+
+  dim3 grid(num_blocks, num_lwes);
+  dim3 threads(num_threads);
+
+  auto d_mem = (Torus *)fp_ks_buffer;
+  auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;
+
+  // individually keyswitch each lwe
+  packing_keyswitch_lwe_list_to_glwe<<<grid, threads, 0, stream>>>(
+      d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
+      glwe_dimension, polynomial_size, base_log, level_count, d_mem);
+  check_cuda_error(cudaGetLastError());
+
+  // accumulate to a single glwe
+  accumulate_glwes<<<num_blocks, threads, 0, stream>>>(
+      glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
+      num_lwes);
+  check_cuda_error(cudaGetLastError());
+}
 
 #endif
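The inner loop of the device function above rounds each LWE coefficient to a multiple of the decomposition base and then walks level_count digits of base 2^base_log. A host-side sketch of that arithmetic (simplified to an unsigned digit split; the real decompose_one produces balanced signed digits):

#include <cassert>
#include <cstdint>

int main() {
  using Torus = uint64_t;
  const uint32_t base_log = 4, level_count = 3;
  const uint32_t non_rep = sizeof(Torus) * 8 - level_count * base_log;

  Torus x = 0x8000000012345678ULL;

  // round_to_closest_multiple, exactly as in torus.cuh
  Torus res = x >> (non_rep - 1);
  res += 1;
  res &= (Torus)(-2);
  Torus rounded = res << (non_rep - 1);

  // digit extraction, level by level, as in the kernel's j-loop
  Torus state = rounded >> non_rep;
  Torus mask = ((Torus)1 << base_log) - 1;
  Torus recomposed = 0;
  for (uint32_t j = 0; j < level_count; j++) {
    Torus digit = state & mask;  // simplified stand-in for decompose_one
    state >>= base_log;
    recomposed += digit << (non_rep + j * base_log);
  }
  assert(recomposed == rounded);  // the digits recombine losslessly
  return 0;
}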
@@ -2,6 +2,7 @@
 #define CNCRT_TORUS_CUH
 
 #include "types/int128.cuh"
+#include "utils/kernel_dimensions.cuh"
 #include <limits>
 
 template <typename T>
@@ -29,20 +30,18 @@ __device__ inline void typecast_double_to_torus<uint64_t>(double x,
 template <typename T>
 __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                               uint32_t level_count) {
-  T shift = sizeof(T) * 8 - level_count * base_log;
-  T mask = 1ll << (shift - 1);
-  T b = (x & mask) >> (shift - 1);
+  const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
+  const T shift = non_rep_bit_count - 1;
   T res = x >> shift;
-  res += b;
-  res <<= shift;
-  return res;
+  res += 1;
+  res &= (T)(-2);
+  return res << shift;
 }
 
 template <typename T>
 __device__ __forceinline__ void modulus_switch(T input, T &output,
                                                uint32_t log_modulus) {
   constexpr uint32_t BITS = sizeof(T) * 8;
 
   output = input + (((T)1) << (BITS - log_modulus - 1));
   output >>= (BITS - log_modulus);
 }
@@ -54,4 +53,27 @@ __device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
   return output;
 }
 
+template <typename Torus>
+__global__ void modulus_switch_inplace(Torus *array, int size,
+                                       uint32_t log_modulus) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < size) {
+    array[tid] = modulus_switch(array[tid], log_modulus);
+  }
+}
+
+template <typename Torus>
+__host__ void host_modulus_switch_inplace(cudaStream_t stream,
+                                          uint32_t gpu_index, Torus *array,
+                                          int size, uint32_t log_modulus) {
+  cudaSetDevice(gpu_index);
+
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
+
+  modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(array, size,
+                                                                 log_modulus);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif // CNCRT_TORUS_H
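A quick numeric check of modulus_switch (illustrative only): it maps a torus element from modulus 2^BITS down to 2^log_modulus with rounding, i.e. output = round(input * 2^log_modulus / 2^BITS).

#include <cassert>
#include <cstdint>

int main() {
  using Torus = uint64_t;
  const uint32_t BITS = 64, log_modulus = 11;

  Torus input = (Torus)1 << 62;                    // 1/4 of the torus
  Torus output = input + ((Torus)1 << (BITS - log_modulus - 1));
  output >>= (BITS - log_modulus);

  assert(output == (Torus)1 << (log_modulus - 2)); // 2048 / 4 = 512
  return 0;
}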
@@ -177,8 +177,8 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
   int num_blocks = (n + block_size - 1) / block_size;
 
   // Launch the kernel
-  cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
-                                                               n);
+  cuda_set_value_kernel<Torus>
+      <<<num_blocks, block_size, 0, stream>>>(d_array, value, n);
   check_cuda_error(cudaGetLastError());
 }
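The launch now names the kernel instantiation explicitly. A minimal CUDA repro of the pattern (the motivation stated here is an assumption: inside a function template the kernel name is dependent, and spelling out the template argument keeps the launch unambiguous across host compilers):

#include <cstdint>

template <typename T>
__global__ void set_value_kernel(T *array, T value, uint32_t n) {
  uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    array[i] = value;
}

template <typename T>
void launch_set_value(cudaStream_t stream, T *array, T value, uint32_t n) {
  int block_size = 256;
  int num_blocks = (n + block_size - 1) / block_size;
  // naming the instantiation explicitly, as the diff above now does
  set_value_kernel<T><<<num_blocks, block_size, 0, stream>>>(array, value, n);
}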
@@ -37,12 +37,12 @@ void host_resolve_signed_overflow(
       streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
       mem->params.big_lwe_dimension, 1);
 
-  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                last_block_inner_propagation, x, mem->params.big_lwe_dimension,
-                1);
-  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                last_block_inner_propagation, last_block_input_carry,
-                mem->params.big_lwe_dimension, 1);
+  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                       last_block_inner_propagation, x,
+                       mem->params.big_lwe_dimension, 1);
+  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                       last_block_inner_propagation, last_block_input_carry,
+                       mem->params.big_lwe_dimension, 1);
 
   host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
                                       last_block_inner_propagation,
@@ -94,14 +94,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
 
   // phase 1
   if (op == SIGNED_OPERATION::ADDITION) {
-    host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
-                  big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
+                         big_lwe_dimension, num_blocks);
   } else {
-    host_integer_radix_negation(
+    host_integer_radix_negation<Torus>(
         streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
         num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
-    host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
-                  big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
+                         big_lwe_dimension, num_blocks);
   }
 
   // phase 2
@@ -109,10 +109,10 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
 
-  host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
-                              result, output_carry, input_carries,
-                              mem_ptr->scp_mem, bsks, ksks, num_blocks);
-  host_generate_last_block_inner_propagation(
+  host_propagate_single_carry<Torus>(
+      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
+      input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
+  host_generate_last_block_inner_propagation<Torus>(
       mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
       last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
       &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
@@ -126,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
   // phase 3
   auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
 
-  host_resolve_signed_overflow(
+  host_resolve_signed_overflow<Torus>(
       streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
      input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);
@@ -17,7 +17,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
   std::function<uint64_t(uint64_t)> predicate_lut_f =
       [](uint64_t x) -> uint64_t { return x == 1; };
 
-  scratch_cuda_integer_radix_cmux_kb(
+  scratch_cuda_integer_radix_cmux_kb<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
       lwe_ciphertext_count, params, allocate_gpu_memory);
@@ -27,10 +27,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
     auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
     auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
 
-    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
-        lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
-        lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
-        params.message_modulus, 1);
+    device_pack_bivariate_blocks<Torus>
+        <<<num_blocks, num_threads, 0, streams[0]>>>(
+            lwe_array_out_block, predicate->lwe_indexes_in,
+            lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
+            params.big_lwe_dimension, params.message_modulus, 1);
     check_cuda_error(cudaGetLastError());
   }
 
@@ -57,13 +58,15 @@ __host__ void host_integer_radix_cmux_kb(
   }
 
   auto mem_true = mem_ptr->zero_if_true_buffer;
-  zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
-              lwe_array_true, lwe_condition, mem_true,
-              mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
+  zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+                     lwe_array_true, lwe_condition, mem_true,
+                     mem_ptr->inverted_predicate_lut, bsks, ksks,
+                     num_radix_blocks);
   auto mem_false = mem_ptr->zero_if_false_buffer;
-  zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
-              lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
-              bsks, ksks, num_radix_blocks);
+  zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
+                     mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
+                     mem_false, mem_ptr->predicate_lut, bsks, ksks,
+                     num_radix_blocks);
   for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
     cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
   }
@@ -75,9 +78,9 @@ __host__ void host_integer_radix_cmux_kb(
   // will be 0 If the condition was false, true_ct will be 0 and false_ct will
   // have kept its value
   auto added_cts = mem_ptr->tmp_true_ct;
-  host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
-                mem_ptr->tmp_false_ct, params.big_lwe_dimension,
-                num_radix_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
+                       mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
+                       params.big_lwe_dimension, num_radix_blocks);
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
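For intuition, a plaintext analogue (illustrative only) of the encrypted CMUX flow above: zero out the branch that is not selected, add the two, then apply the message-extraction LUT.

#include <cassert>
#include <cstdint>

int main() {
  auto zero_out_if = [](bool pred, uint64_t block) -> uint64_t {
    return pred ? 0 : block;  // the predicate LUT zeroes the discarded branch
  };
  bool condition = true;
  uint64_t true_ct = 3, false_ct = 5;

  // keep true_ct when the condition holds, keep false_ct otherwise
  uint64_t kept_true = zero_out_if(!condition, true_ct);
  uint64_t kept_false = zero_out_if(condition, false_ct);
  uint64_t added = kept_true + kept_false;  // exactly one summand is nonzero
  assert(added == (condition ? true_ct : false_ct));
  return 0;
}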
@@ -43,7 +43,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
   int num_entries = (lwe_dimension + 1);
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
   // Add all blocks and store in sum
-  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
+  device_accumulate_all_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
       output, input, lwe_dimension, num_radix_blocks);
   check_cuda_error(cudaGetLastError());
 }
@@ -62,7 +62,6 @@ __host__ void are_all_comparisons_block_true(
     int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
     uint32_t num_radix_blocks) {
 
-  cudaSetDevice(gpu_indexes[0]);
   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
   auto glwe_dimension = params.glwe_dimension;
@@ -96,8 +95,9 @@ __host__ void are_all_comparisons_block_true(
     auto is_equal_to_num_blocks_map =
         &are_all_block_true_buffer->is_equal_to_lut_map;
     for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
-                            input_blocks, big_lwe_dimension, chunk_length);
+      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
+                                   input_blocks, big_lwe_dimension,
+                                   chunk_length);
 
       accumulator += (big_lwe_dimension + 1);
       remaining_blocks -= (chunk_length - 1);
@@ -165,7 +165,6 @@ __host__ void is_at_least_one_comparisons_block_true(
     int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
     uint32_t num_radix_blocks) {
 
-  cudaSetDevice(gpu_indexes[0]);
   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
   auto message_modulus = params.message_modulus;
@@ -192,8 +191,9 @@ __host__ void is_at_least_one_comparisons_block_true(
   auto input_blocks = mem_ptr->tmp_lwe_array_out;
   auto accumulator = buffer->tmp_block_accumulated;
   for (int i = 0; i < num_chunks; i++) {
-    accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
-                          input_blocks, big_lwe_dimension, chunk_length);
+    accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
+                                 input_blocks, big_lwe_dimension,
+                                 chunk_length);
 
     accumulator += (big_lwe_dimension + 1);
     remaining_blocks -= (chunk_length - 1);
@@ -280,8 +280,8 @@ __host__ void host_compare_with_zero_equality(
       uint32_t chunk_size =
           std::min(remainder_blocks, num_elements_to_fill_carry);
 
-      accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
-                            big_lwe_dimension, chunk_size);
+      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
+                                   big_lwe_dimension, chunk_size);
 
       num_sum_blocks++;
       remainder_blocks -= (chunk_size - 1);
@@ -295,8 +295,9 @@ __host__ void host_compare_with_zero_equality(
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
       zero_comparison);
-  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                 sum, mem_ptr, bsks, ksks, num_sum_blocks);
+  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
+                                        lwe_array_out, sum, mem_ptr, bsks, ksks,
+                                        num_sum_blocks);
 }
 
 template <typename Torus>
@@ -310,7 +311,7 @@ __host__ void host_integer_radix_equality_check_kb(
 
   // Applies the LUT for the comparison operation
   auto comparisons = mem_ptr->tmp_block_comparisons;
-  integer_radix_apply_bivariate_lookup_table_kb(
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
      bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
      eq_buffer->operator_lut->params.message_modulus);
@@ -319,9 +320,9 @@ __host__ void host_integer_radix_equality_check_kb(
   //
   // It returns a block encrypting 1 if all input blocks are 1
   // otherwise the block encrypts 0
-  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                 comparisons, mem_ptr, bsks, ksks,
-                                 num_radix_blocks);
+  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
+                                        lwe_array_out, comparisons, mem_ptr,
+                                        bsks, ksks, num_radix_blocks);
 }
 
 template <typename Torus>
@@ -352,19 +353,20 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
 
   // Subtract
   // Here we need the true lwe sub, not the one that comes from shortint.
-  host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
-                   lwe_array_right, big_lwe_dimension, num_radix_blocks);
+  host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array_out,
+                          lwe_array_left, lwe_array_right, big_lwe_dimension,
+                          num_radix_blocks);
 
   // Apply LUT to compare to 0
   auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
      num_radix_blocks, is_non_zero_lut);
 
   // Add one
   // Here Lhs can have the following values: (-1) % (message modulus * carry
   // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(
+  host_integer_radix_add_scalar_one_inplace<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
      num_radix_blocks, message_modulus, carry_modulus);
 }
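A plaintext sketch of the per-block comparison just described: subtract, map the difference to a sign with an is-non-zero style LUT, then add one to land in {0, 1, 2}. The exact LUT encoding below is an assumption for illustration, not taken from tfhe-rs.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t mod = 16;  // message_modulus * carry_modulus, illustrative
  uint64_t left = 3, right = 7;

  // the true LWE subtraction wraps modulo the plaintext space
  uint64_t diff = (left + mod - right) % mod;            // here: 12, i.e. -4

  // assumed LUT: 0 stays 0, "negative" wraps to mod-1, "positive" maps to 1
  uint64_t lut = (diff == 0) ? 0 : (diff < mod / 2 ? 1 : mod - 1);

  uint64_t sign = (lut + 1) % mod;                       // add one -> {0,1,2}
  assert(sign == 0);  // 0 = inferior, 1 = equal, 2 = superior
  return 0;
}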
@@ -406,8 +408,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
 
   auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
   while (partial_block_count > 2) {
-    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
-                partial_block_count, 4);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                       partial_block_count, 4);
 
     integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -433,8 +435,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
   std::function<Torus(Torus)> f;
 
   if (partial_block_count == 2) {
-    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
-                partial_block_count, 4);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                       partial_block_count, 4);
 
     f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
       int msb = (x >> 2) & 3;
@@ -454,9 +456,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
   last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 
   // Last leaf
-  integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
-                                                 gpu_count, lwe_array_out, y,
-                                                 bsks, ksks, 1, last_lut);
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
+      last_lut);
 }
 
 template <typename Torus>
@@ -488,19 +490,21 @@ __host__ void host_integer_radix_difference_check_kb(
     if (mem_ptr->is_signed) {
       packed_num_radix_blocks -= 2;
     }
-    pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
-                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
-    pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
-                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
+                       big_lwe_dimension, packed_num_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_right,
+                       lwe_array_right, big_lwe_dimension,
+                       packed_num_radix_blocks, message_modulus);
     // From this point we have half number of blocks
     packed_num_radix_blocks /= 2;
 
     // Clean noise
     auto identity_lut = mem_ptr->identity_lut;
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
         packed_num_radix_blocks, identity_lut);
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
         packed_num_radix_blocks, identity_lut);
@@ -517,16 +521,17 @@ __host__ void host_integer_radix_difference_check_kb(
   if (!mem_ptr->is_signed) {
     // Compare packed blocks, or simply the total number of radix blocks in the
     // inputs
-    compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
-                            rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
+    compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count, comparisons,
+                                   lhs, rhs, mem_ptr, bsks, ksks,
+                                   packed_num_radix_blocks);
     num_comparisons = packed_num_radix_blocks;
   } else {
     // Packing is possible
     if (carry_modulus >= message_modulus) {
       // Compare (num_radix_blocks - 2) / 2 packed blocks
-      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
-                              rhs, mem_ptr, bsks, ksks,
-                              packed_num_radix_blocks);
+      compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                     comparisons, lhs, rhs, mem_ptr, bsks, ksks,
+                                     packed_num_radix_blocks);
 
       // Compare the last block before the sign block separately
       auto identity_lut = mem_ptr->identity_lut;
@@ -535,21 +540,21 @@ __host__ void host_integer_radix_difference_check_kb(
       Torus *last_right_block_before_sign_block =
           diff_buffer->tmp_packed_right +
          packed_num_radix_blocks * big_lwe_size;
-      integer_radix_apply_univariate_lookup_table_kb(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
          identity_lut);
-      integer_radix_apply_univariate_lookup_table_kb(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
          1, identity_lut);
-      compare_radix_blocks_kb(
+      compare_radix_blocks_kb<Torus>(
         streams, gpu_indexes, gpu_count,
         comparisons + packed_num_radix_blocks * big_lwe_size,
         last_left_block_before_sign_block, last_right_block_before_sign_block,
        mem_ptr, bsks, ksks, 1);
       // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count,
         comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
         lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -558,11 +563,11 @@ __host__ void host_integer_radix_difference_check_kb(
       num_comparisons = packed_num_radix_blocks + 2;
 
     } else {
-      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
-                              lwe_array_left, lwe_array_right, mem_ptr, bsks,
-                              ksks, num_radix_blocks - 1);
+      compare_radix_blocks_kb<Torus>(
+          streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
+          lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
       // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count,
         comparisons + (num_radix_blocks - 1) * big_lwe_size,
         lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -575,9 +580,9 @@ __host__ void host_integer_radix_difference_check_kb(
   // Reduces a vec containing radix blocks that encrypts a sign
   // (inferior, equal, superior) to one single radix block containing the
   // final sign
-  tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
-                      comparisons, mem_ptr->diff_buffer->tree_buffer,
-                      reduction_lut_f, bsks, ksks, num_comparisons);
+  tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                             comparisons, mem_ptr->diff_buffer->tree_buffer,
+                             reduction_lut_f, bsks, ksks, num_comparisons);
 }
 
 template <typename Torus>
@@ -601,16 +606,16 @@ host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                              Torus **ksks, uint32_t total_num_radix_blocks) {
 
   // Compute the sign
-  host_integer_radix_difference_check_kb(
+  host_integer_radix_difference_check_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
      ksks, total_num_radix_blocks);
 
   // Selector
-  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
-                             total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out,
+      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
+      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
 }
 
 #endif
@@ -0,0 +1,88 @@
+#include "compression.cuh"
+
+void scratch_cuda_integer_compress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
+    bool allocate_gpu_memory) {
+
+  int_radix_params compression_params(
+      pbs_type, compression_glwe_dimension, compression_polynomial_size,
+      (compression_glwe_dimension + 1) * compression_polynomial_size,
+      lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
+      carry_modulus);
+
+  scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_compression<uint64_t> **)mem_ptr, num_lwes, compression_params,
+      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
+}
+void scratch_cuda_integer_decompress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
+    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
+    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
+    bool allocate_gpu_memory) {
+
+  // Decompression doesn't keyswitch, so big and small dimensions are the same
+  int_radix_params encryption_params(
+      pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
+      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
+      message_modulus, carry_modulus);
+
+  int_radix_params compression_params(
+      pbs_type, compression_glwe_dimension, compression_polynomial_size,
+      lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
+      0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);
+
+  scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_decompression<uint64_t> **)mem_ptr, num_lwes, body_count,
+      encryption_params, compression_params, storage_log_modulus,
+      allocate_gpu_memory);
+}
+void cuda_integer_compress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
+    int8_t *mem_ptr) {
+
+  host_integer_compress<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_nths,
+      (int_compression<uint64_t> *)mem_ptr);
+}
+void cuda_integer_decompress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *glwe_in, void *indexes_array,
+    uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {
+
+  host_integer_decompress<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out), static_cast<uint64_t *>(glwe_in),
+      static_cast<uint32_t *>(indexes_array), indexes_array_size, bsks,
+      (int_decompression<uint64_t> *)mem_ptr);
+}
+
+void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
+                                                       uint32_t *gpu_indexes,
+                                                       uint32_t gpu_count,
+                                                       int8_t **mem_ptr_void) {
+
+  int_compression<uint64_t> *mem_ptr =
+      (int_compression<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
+
+void cleanup_cuda_integer_decompress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
+
+  int_decompression<uint64_t> *mem_ptr =
+      (int_decompression<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
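A hedged end-to-end sketch of the new compression entry points (parameter values are placeholders, the PBS_TYPE value is assumed, and key/ciphertext pointers are assumed to already live on the GPU):

#include <cstdint>
#include "compression.h"  // the entry points defined above

void compress_then_free(void **streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, void *glwe_array_out,
                        void *lwe_array_in, void **fp_ksk) {
  // Illustrative parameter choices; real values come from the Rust side.
  uint32_t compression_glwe_dimension = 1, compression_polynomial_size = 1024;
  uint32_t lwe_dimension = 742, ks_level = 3, ks_base_log = 4;
  uint32_t num_lwes = 8, message_modulus = 4, carry_modulus = 4;
  uint32_t lwe_per_glwe = 1024, storage_log_modulus = 11;

  int8_t *mem_ptr = nullptr;
  scratch_cuda_integer_compress_radix_ciphertext_64(
      streams, gpu_indexes, gpu_count, &mem_ptr, compression_glwe_dimension,
      compression_polynomial_size, lwe_dimension, ks_level, ks_base_log,
      num_lwes, message_modulus, carry_modulus, MULTI_BIT, lwe_per_glwe,
      storage_log_modulus, /*allocate_gpu_memory=*/true);

  cuda_integer_compress_radix_ciphertext_64(streams, gpu_indexes, gpu_count,
                                            glwe_array_out, lwe_array_in,
                                            fp_ksk, num_lwes, mem_ptr);

  cleanup_cuda_integer_compress_radix_ciphertext_64(streams, gpu_indexes,
                                                    gpu_count, &mem_ptr);
}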
@@ -0,0 +1,277 @@
+#ifndef CUDA_INTEGER_COMPRESSION_CUH
+#define CUDA_INTEGER_COMPRESSION_CUH
+
+#include "ciphertext.h"
+#include "compression.h"
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer/integer.cuh"
+#include "linearalgebra/multiplication.cuh"
+#include "polynomial/functions.cuh"
+#include "utils/kernel_dimensions.cuh"
+
+template <typename Torus>
+__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
+                     uint32_t in_len, uint32_t len) {
+  auto nbits = sizeof(Torus) * 8;
+
+  auto i = threadIdx.x + blockIdx.x * blockDim.x;
+  if (i < len) {
+    auto k = nbits * i / log_modulus;
+    auto j = k;
+
+    auto start_shift = i * nbits - j * log_modulus;
+
+    auto value = array_in[j] >> start_shift;
+    j++;
+
+    while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
+      auto shift = j * log_modulus - i * nbits;
+      value |= array_in[j] << shift;
+      j++;
+    }
+
+    array_out[i] = value;
+  }
+}
+
+template <typename Torus>
+__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
+                        Torus *array_out, Torus *array_in, uint32_t num_inputs,
+                        uint32_t body_count, int_compression<Torus> *mem_ptr) {
+  cudaSetDevice(gpu_index);
+  auto params = mem_ptr->compression_params;
+
+  auto log_modulus = mem_ptr->storage_log_modulus;
+  auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
+  auto number_bits_to_pack = in_len * log_modulus;
+  auto nbits = sizeof(Torus) * 8;
+  // number_bits_to_pack.div_ceil(Scalar::BITS)
+  auto len = (number_bits_to_pack + nbits - 1) / nbits;
+
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(len, 128, num_blocks, num_threads);
+
+  dim3 grid(num_blocks);
+  dim3 threads(num_threads);
+  pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
+                                            in_len, len);
+}
+template <typename Torus>
+__host__ void host_integer_compress(cudaStream_t *streams,
+                                    uint32_t *gpu_indexes, uint32_t gpu_count,
+                                    Torus *glwe_array_out, Torus *lwe_array_in,
+                                    Torus **fp_ksk, uint32_t num_lwes,
+                                    int_compression<Torus> *mem_ptr) {
+
+  auto compression_params = mem_ptr->compression_params;
+  auto input_lwe_dimension = compression_params.small_lwe_dimension;
+
+  // Shift
+  auto lwe_shifted = mem_ptr->tmp_lwe;
+  host_cleartext_multiplication<Torus>(
+      streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
+      (uint64_t)compression_params.message_modulus, input_lwe_dimension,
+      num_lwes);
+
+  uint32_t lwe_in_size = input_lwe_dimension + 1;
+  uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
+                           compression_params.polynomial_size;
+  uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
+  auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);
+
+  // Keyswitch LWEs to GLWE
+  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
+  auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
+  for (int i = 0; i < num_glwes; i++) {
+    auto lwe_subset = lwe_shifted + i * lwe_in_size;
+    auto glwe_out = tmp_glwe_array_out + i * glwe_out_size;
+
+    host_packing_keyswitch_lwe_list_to_glwe<Torus>(
+        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
+        fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
+        compression_params.polynomial_size, compression_params.ks_base_log,
+        compression_params.ks_level, body_count);
+  }
+
+  // Modulus switch
+  host_modulus_switch_inplace<Torus>(
+      streams[0], gpu_indexes[0], tmp_glwe_array_out,
+      num_glwes * (compression_params.glwe_dimension *
+                       compression_params.polynomial_size +
+                   body_count),
+      mem_ptr->storage_log_modulus);
+  check_cuda_error(cudaGetLastError());
+
+  host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
+                   tmp_glwe_array_out, num_glwes, body_count, mem_ptr);
+}
+template <typename Torus>
+__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
+                        uint32_t log_modulus, uint32_t initial_out_len) {
+  auto nbits = sizeof(Torus) * 8;
+
+  auto i = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (i < initial_out_len) {
+    // Unpack
+    Torus mask = ((Torus)1 << log_modulus) - 1;
+    auto start = i * log_modulus;
+    auto end = (i + 1) * log_modulus;
+
+    auto start_block = start / nbits;
+    auto start_remainder = start % nbits;
+
+    auto end_block_inclusive = (end - 1) / nbits;
+
+    Torus unpacked_i;
+    if (start_block == end_block_inclusive) {
+      auto single_part = array_in[start_block] >> start_remainder;
+      unpacked_i = single_part & mask;
+    } else {
+      auto first_part = array_in[start_block] >> start_remainder;
+      auto second_part = array_in[start_block + 1] << (nbits - start_remainder);
+
+      unpacked_i = (first_part | second_part) & mask;
+    }
+
+    // Extract
+    glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
+  }
+}
+
+template <typename Torus>
+__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
+                           Torus *glwe_array_out, Torus *array_in,
+                           uint32_t glwe_index,
+                           int_decompression<Torus> *mem_ptr) {
+  cudaSetDevice(gpu_index);
+
+  auto params = mem_ptr->compression_params;
+
+  auto log_modulus = mem_ptr->storage_log_modulus;
+
+  uint32_t body_count = mem_ptr->body_count;
+
+  auto initial_out_len =
+      params.glwe_dimension * params.polynomial_size + body_count;
+
+  // We assure the tail of the glwe is zeroed
+  auto zeroed_slice = glwe_array_out + initial_out_len;
+  cuda_memset_async(zeroed_slice, 0,
+                    (params.polynomial_size - body_count) * sizeof(Torus),
+                    stream, gpu_index);
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  dim3 grid(num_blocks);
+  dim3 threads(num_threads);
+  extract<Torus><<<grid, threads, 0, stream>>>(
+      glwe_array_out, array_in, glwe_index, log_modulus, initial_out_len);
+  check_cuda_error(cudaGetLastError());
+}
+template <typename Torus>
+__host__ void
+host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, Torus *lwe_array_out,
+                        Torus *packed_glwe_in, uint32_t *indexes_array,
+                        uint32_t indexes_array_size, void **bsks,
+                        int_decompression<Torus> *mem_ptr) {
+
+  auto extracted_glwe = mem_ptr->tmp_extracted_glwe;
+  auto compression_params = mem_ptr->compression_params;
+  host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
+                      packed_glwe_in, 0, mem_ptr);
+
+  auto num_lwes = mem_ptr->num_lwes;
+
+  // Sample extract
+  auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
+  cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
+                              extracted_glwe, indexes_array, indexes_array_size,
+                              compression_params.glwe_dimension,
+                              compression_params.polynomial_size);
+
+  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+  /// dimension to a big LWE dimension
+  auto encryption_params = mem_ptr->encryption_params;
+  auto lut = mem_ptr->carry_extract_lut;
+  auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
+  if (active_gpu_count == 1) {
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_out,
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
+        lut->lwe_indexes_in, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type);
+  } else {
+    /// For multi GPU execution we create vectors of pointers for inputs and
+    /// outputs
+    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+    std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
+        compression_params.small_lwe_dimension + 1);
+
+    /// Apply PBS
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      lwe_array_out, lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes, num_lwes,
+                                      encryption_params.big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+  }
+}
+
+template <typename Torus>
+__host__ void scratch_cuda_compress_integer_radix_ciphertext(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_compression<Torus> **mem_ptr, uint32_t num_lwes,
+    int_radix_params compression_params, uint32_t lwe_per_glwe,
+    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_compression<Torus>(
+      streams, gpu_indexes, gpu_count, compression_params, num_lwes,
+      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
+}
+
+template <typename Torus>
+__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_decompression<Torus> **mem_ptr, uint32_t num_lwes, uint32_t body_count,
+    int_radix_params encryption_params, int_radix_params compression_params,
+    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_decompression<Torus>(
+      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
+      num_lwes, body_count, storage_log_modulus, allocate_gpu_memory);
+}
+#endif
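An illustrative host-side round trip of the bit packing used by pack() and extract() above: log_modulus-bit values are concatenated into 64-bit words, and extraction re-expands each value into the top bits of a torus element. This mirrors the kernels' arithmetic but is not part of the diff.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  using Torus = uint64_t;
  const uint32_t nbits = 64, log_modulus = 12;
  std::vector<Torus> in = {0xABC, 0x123, 0xFFF, 0x001, 0x800, 0x7EF};

  // pack: in.size() * log_modulus bits, div_ceil'd into 64-bit words
  uint64_t total_bits = in.size() * log_modulus;
  std::vector<Torus> packed((total_bits + nbits - 1) / nbits, 0);
  for (size_t i = 0; i < in.size(); i++) {
    uint64_t start = i * log_modulus, rem = start % nbits;
    packed[start / nbits] |= in[i] << rem;
    if (rem + log_modulus > nbits)               // value straddles two words
      packed[start / nbits + 1] |= in[i] >> (nbits - rem);
  }

  // extract: read log_modulus bits back, shift into the torus MSBs
  for (size_t i = 0; i < in.size(); i++) {
    uint64_t start = i * log_modulus, rem = start % nbits;
    Torus mask = ((Torus)1 << log_modulus) - 1;
    Torus lo = packed[start / nbits] >> rem;
    Torus hi = (rem + log_modulus > nbits)
                   ? packed[start / nbits + 1] << (nbits - rem)
                   : 0;
    Torus unpacked = (lo | hi) & mask;
    assert(unpacked == in[i]);
    Torus torus_coeff = unpacked << (nbits - log_modulus);  // as in extract()
    (void)torus_coeff;
  }
  return 0;
}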
@@ -282,7 +282,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   // Shift the mask so that we will only keep bits we should
   uint32_t shifted_mask = full_message_mask >> shift_amount;
 
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
       interesting_divisor.last_block(), bsks, ksks, 1,
       mem_ptr->masking_luts_1[shifted_mask]);
@@ -310,7 +310,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   // the estimated degree of the output is < msg_modulus
   shifted_mask = shifted_mask & full_message_mask;
 
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
       divisor_ms_blocks.first_block(), bsks, ksks, 1,
       mem_ptr->masking_luts_2[shifted_mask]);
@@ -334,7 +334,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   interesting_remainder1.insert(0, numerator_block_1.first_block(),
                                 streams[0], gpu_indexes[0]);
 
-  host_integer_radix_logical_scalar_shift_kb_inplace(
+  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
       streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
       mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
 
@@ -342,7 +342,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  interesting_remainder1.len - 1, streams[0],
                                  gpu_indexes[0]);
 
-  host_radix_blocks_rotate_left(
+  host_radix_blocks_rotate_left<Torus>(
       streams, gpu_indexes, gpu_count, interesting_remainder1.data,
       tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
 
@@ -363,7 +363,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
 
   auto left_shift_interesting_remainder2 =
       [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
-        host_integer_radix_logical_scalar_shift_kb_inplace(
+        host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
             streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
             mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
       }; // left_shift_interesting_remainder2
@@ -396,10 +396,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   // but in that position, interesting_remainder2 always has a 0
   auto &merged_interesting_remainder = interesting_remainder1;
 
-  host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
-                merged_interesting_remainder.data,
-                interesting_remainder2.data, radix_params.big_lwe_dimension,
-                merged_interesting_remainder.len);
+  host_addition<Torus>(
+      streams[0], gpu_indexes[0], merged_interesting_remainder.data,
+      merged_interesting_remainder.data, interesting_remainder2.data,
+      radix_params.big_lwe_dimension, merged_interesting_remainder.len);
 
   // after create_clean_version_of_merged_remainder
   // `merged_interesting_remainder` will be reused as
@@ -439,7 +439,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   // We could call unchecked_scalar_ne
   // But we are in the special case where scalar == 0
   // So we can skip some stuff
-  host_compare_with_zero_equality(
+  host_compare_with_zero_equality<Torus>(
       streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
       mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
       mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
@@ -447,7 +447,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   tmp_1.len =
       ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
 
-  is_at_least_one_comparisons_block_true(
+  is_at_least_one_comparisons_block_true<Torus>(
       streams, gpu_indexes, gpu_count,
       at_least_one_upper_block_is_non_zero.data, tmp_1.data,
      mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
@@ -460,7 +460,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   // `cleaned_merged_interesting_remainder` - radix ciphertext
   auto create_clean_version_of_merged_remainder =
       [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
-        integer_radix_apply_univariate_lookup_table_kb(
+        integer_radix_apply_univariate_lookup_table_kb<Torus>(
            streams, gpu_indexes, gpu_count,
            cleaned_merged_interesting_remainder.data,
            cleaned_merged_interesting_remainder.data, bsks, ksks,
@@ -486,10 +486,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
     cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
   }
 
-  host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
-                subtraction_overflowed.data,
-                at_least_one_upper_block_is_non_zero.data,
-                radix_params.big_lwe_dimension, 1);
+  host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
+                       subtraction_overflowed.data,
+                       at_least_one_upper_block_is_non_zero.data,
+                       radix_params.big_lwe_dimension, 1);
 
   int factor = (i) ? 3 : 2;
   int factor_lut_id = factor - 2;
@@ -528,10 +528,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
         mem_ptr->merge_overflow_flags_luts[pos_in_block]
             ->params.message_modulus);
 
-    host_addition(streams[0], gpu_indexes[0],
-                  &quotient[block_of_bit * big_lwe_size],
-                  &quotient[block_of_bit * big_lwe_size],
-                  did_not_overflow.data, radix_params.big_lwe_dimension, 1);
+    host_addition<Torus>(
+        streams[0], gpu_indexes[0], &quotient[block_of_bit * big_lwe_size],
+        &quotient[block_of_bit * big_lwe_size], did_not_overflow.data,
+        radix_params.big_lwe_dimension, 1);
   };
 
   for (uint j = 0; j < gpu_count; j++) {
@@ -564,17 +564,17 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
 
   // Clean the quotient and remainder
   // as even though they have no carries, they are not at nominal noise level
-  host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
-                remainder2.data, radix_params.big_lwe_dimension,
-                remainder1.len);
+  host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1.data,
+                       remainder2.data, radix_params.big_lwe_dimension,
+                       remainder1.len);
 
   for (uint j = 0; j < gpu_count; j++) {
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
       mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
       bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
      ksks, num_blocks, mem_ptr->message_extract_lut_2);
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
@@ -53,7 +53,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
       ks_base_log, pbs_level, pbs_base_log, grouping_factor,
       message_modulus, carry_modulus);
 
-  scratch_cuda_propagate_single_carry_kb_inplace(
+  scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
       allocate_gpu_memory);
@@ -195,15 +195,15 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
 
 void cuda_integer_compute_prefix_sum_hillis_steele_64(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
-    void **bsks, uint32_t num_blocks, uint32_t shift) {
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift) {
 
   int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;
 
   host_compute_prefix_sum_hillis_steele<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       static_cast<uint64_t *>(output_radix_lwe),
-      static_cast<uint64_t *>(input_radix_lwe), params,
+      static_cast<uint64_t *>(generates_or_propagates), params,
       (int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
       num_blocks);
 }
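For reference, the Hillis-Steele scan named in cuda_integer_compute_prefix_sum_hillis_steele_64 is an inclusive prefix sum whose stride doubles each pass, finishing in ceil(log2(n)) passes. A clear-text sketch for intuition only; in the encrypted version each combine step goes through a LUT evaluation (keyswitch + PBS) over radix blocks:

// Minimal clear-text Hillis-Steele inclusive scan (sketch, not the CUDA code).
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> a = {3, 1, 7, 0, 4, 1, 6, 3};
  // At stride d, element i receives a[i - d]; strides double each pass.
  for (size_t d = 1; d < a.size(); d *= 2) {
    std::vector<int> prev = a; // snapshot: plays the role of double buffering
    for (size_t i = d; i < a.size(); i++)
      a[i] = prev[i] + prev[i - d];
  }
  for (int v : a)
    printf("%d ", v); // 3 4 11 11 15 16 22 25
  return 0;
}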
@@ -78,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
           "pointers should be different");
   }
   cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_right<<<blocks_count, 1024, 0, streams[0]>>>(
+  radix_blocks_rotate_right<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
       dst, src, value, blocks_count, lwe_size);
 }
 
@@ -95,7 +95,7 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
           "pointers should be different");
   }
   cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_left<<<blocks_count, 1024, 0, streams[0]>>>(
+  radix_blocks_rotate_left<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
       dst, src, value, blocks_count, lwe_size);
 }
 
@@ -124,8 +124,8 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
                                   uint32_t lwe_size) {
   cudaSetDevice(gpu_indexes[0]);
   int num_blocks = blocks_count / 2, num_threads = 1024;
-  radix_blocks_reverse_lwe_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
-      src, blocks_count, lwe_size);
+  radix_blocks_reverse_lwe_inplace<Torus>
+      <<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
 }
 
 // polynomial_size threads
@@ -164,9 +164,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,
   int num_blocks = 0, num_threads = 0;
   int num_entries = num_radix_blocks * (lwe_dimension + 1);
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
-      lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
-      lwe_dimension, shift, num_radix_blocks);
+  device_pack_bivariate_blocks<Torus>
+      <<<num_blocks, num_threads, 0, streams[0]>>>(
+          lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2,
+          lwe_indexes_in, lwe_dimension, shift, num_radix_blocks);
   check_cuda_error(cudaGetLastError());
 }
 
@@ -273,10 +274,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
 
   // Left message is shifted
   auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
-  pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
-                        lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
-                        lut->lwe_indexes_in, big_lwe_dimension, shift,
-                        num_radix_blocks);
+  pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count,
+                               lwe_array_pbs_in, lut->lwe_trivial_indexes,
+                               lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
+                               big_lwe_dimension, shift, num_radix_blocks);
   check_cuda_error(cudaGetLastError());
 
   /// For multi GPU execution we create vectors of pointers for inputs and
@@ -380,7 +381,7 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
     body[i] = -body[i];
   }
 
-  rotate_left(body, half_box_size, polynomial_size);
+  rotate_left<Torus>(body, half_box_size, polynomial_size);
 }
 
 template <typename Torus>
@@ -590,13 +591,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
       ksks, num_blocks, luts_array);
 
   // compute prefix sum with hillis&steele
-  host_compute_prefix_sum_hillis_steele(
+  host_compute_prefix_sum_hillis_steele<Torus>(
       streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
       params, luts_carry_propagation_sum, bsks, ksks, num_blocks);
 
-  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
-                                 generates_or_propagates, 1, num_blocks,
-                                 big_lwe_size);
+  host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                        step_output, generates_or_propagates, 1,
+                                        num_blocks, big_lwe_size);
   if (carry_out != nullptr) {
     cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes,
                                  streams[0], gpu_indexes[0]);
@@ -610,8 +611,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
                       gpu_indexes[0]);
   }
 
-  host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
-                glwe_dimension * polynomial_size, num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
+                       step_output, glwe_dimension * polynomial_size,
+                       num_blocks);
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
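The generates_or_propagates values feeding this scan encode, per block, whether the block creates a carry on its own, passes an incoming carry through, or stops it. Resolving a prefix of such flags is associative, which is what lets the Hillis-Steele scan evaluate all carries in log2(num_blocks) LUT passes instead of a sequential sweep. A clear-integer sketch, under the assumption of 2-bit message blocks (modulus 4, so a block sum above 3 generates and a sum of exactly 3 propagates):

// Sketch only: sequential resolution of generate/propagate flags on clear
// digits; the encrypted code computes the same result with a parallel scan.
#include <cstdio>
#include <vector>

enum Flag { NEITHER = 0, GENERATE = 1, PROPAGATE = 2 };

int main() {
  // Per-block sums of two 2-bit operands, least significant block first.
  std::vector<int> block_sum = {5, 3, 2, 3};
  std::vector<Flag> flags(block_sum.size());
  for (size_t i = 0; i < block_sum.size(); i++)
    flags[i] = block_sum[i] > 3 ? GENERATE
               : block_sum[i] == 3 ? PROPAGATE : NEITHER;

  std::vector<int> carry_in(block_sum.size(), 0);
  int resolved = 0; // carry flowing into the current block
  for (size_t i = 0; i < block_sum.size(); i++) {
    carry_in[i] = resolved;
    resolved = flags[i] == GENERATE ? 1
               : flags[i] == PROPAGATE ? resolved : 0;
  }
  for (size_t i = 0; i < block_sum.size(); i++)
    printf("block %zu: sum=%d carry_in=%d -> digit=%d\n", i, block_sum[i],
           carry_in[i], (block_sum[i] + carry_in[i]) % 4);
  return 0;
}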
@@ -664,14 +666,15 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
       overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
       big_lwe_size_bytes, streams[0], gpu_indexes[0]);
 
-  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
-                                 generates_or_propagates, 1, num_blocks,
-                                 big_lwe_size);
+  host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                        step_output, generates_or_propagates, 1,
+                                        num_blocks, big_lwe_size);
   cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
                     gpu_indexes[0]);
 
-  host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
-                   step_output, glwe_dimension * polynomial_size, num_blocks);
+  host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
+                          step_output, glwe_dimension * polynomial_size,
+                          num_blocks);
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -727,10 +730,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
 
     if (i < num_blocks - 1) {
       auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
-      host_addition(streams[0], gpu_indexes[0], next_input_block,
-                    next_input_block,
-                    &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
-                    params.big_lwe_dimension, 1);
+      host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
+                           next_input_block,
+                           &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
+                           params.big_lwe_dimension, 1);
     }
   }
 }
@@ -765,7 +768,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
   }
 
   if (num_radix_blocks % 2 == 1) {
-    // We couldn't pack the last block, so we just copy it
+    // We couldn't host_pack the last block, so we just copy it
     Torus *lsb_block =
         lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
     Torus *last_block =
@@ -794,7 +797,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
   int num_blocks = 0, num_threads = 0;
   int num_entries = (lwe_dimension + 1);
   getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
-  device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
+  device_pack_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
       lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
 }
 
@@ -840,7 +843,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
   // this
   uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
 
-  device_create_trivial_radix<<<grid, thds, 0, stream>>>(
+  device_create_trivial_radix<Torus><<<grid, thds, 0, stream>>>(
       lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
   check_cuda_error(cudaGetLastError());
 }
@@ -857,7 +860,7 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
                              uint32_t num_radix_blocks, uint32_t bits_per_block,
                              int_bit_extract_luts_buffer<Torus> *bit_extract) {
 
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
       num_radix_blocks * bits_per_block, bit_extract->lut);
 }
@@ -870,7 +873,6 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
              std::function<Torus(Torus)> sign_handler_f, void **bsks,
              Torus **ksks, uint32_t num_sign_blocks) {
 
-  cudaSetDevice(gpu_indexes[0]);
   auto diff_buffer = mem_ptr->diff_buffer;
 
   auto params = mem_ptr->params;
@@ -904,9 +906,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
     lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 
     while (num_sign_blocks > 2) {
-      pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
-                  big_lwe_dimension, num_sign_blocks, 4);
-      integer_radix_apply_univariate_lookup_table_kb(
+      pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
+                         big_lwe_dimension, num_sign_blocks, 4);
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
           streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
           num_sign_blocks / 2, lut);
 
@@ -937,11 +939,11 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
                                  final_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 
-    pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
-                2, 4);
-    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
-                                                   gpu_count, signs_array_out,
-                                                   signs_b, bsks, ksks, 1, lut);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
+                       big_lwe_dimension, 2, 4);
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
+        1, lut);
 
   } else {
 
@@ -957,9 +959,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
                                  final_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 
-    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
-                                                   gpu_count, signs_array_out,
-                                                   signs_a, bsks, ksks, 1, lut);
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
+        1, lut);
   }
 }
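reduce_signs above repeatedly packs two sign blocks (each encoding 0, 1 or 2) into a single value a*4 + b via pack_blocks with factor 4, then one LUT collapses the pair back to a single sign, halving the block count per round. A clear sketch of that packing; the pair-reduction rule shown is an assumption (the more significant sign wins unless it reports "equal"), not a transcript of the actual LUT:

// Sketch only: enumerate packed sign pairs and a plausible reduction rule.
#include <cstdio>

// Assumed rule: 0 = inferior, 1 = equal, 2 = superior; if the more
// significant comparison is "equal", the less significant one decides.
int reduce_pair(int msb_sign, int lsb_sign) {
  return msb_sign == 1 ? lsb_sign : msb_sign;
}

int main() {
  for (int hi = 0; hi <= 2; hi++)
    for (int lo = 0; lo <= 2; lo++) {
      int packed = hi * 4 + lo; // what pack_blocks(..., factor = 4) produces
      printf("packed=%2d -> sign=%d\n", packed, reduce_pair(hi, lo));
    }
  return 0;
}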
@@ -241,7 +241,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
         (cudaStream_t *)(streams), gpu_indexes, gpu_count,
         static_cast<uint64_t *>(radix_lwe_out),
         static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
     break;
   case 1024:
     host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -249,7 +250,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
         (cudaStream_t *)(streams), gpu_indexes, gpu_count,
         static_cast<uint64_t *>(radix_lwe_out),
         static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
     break;
   case 2048:
     host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -257,7 +259,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
         (cudaStream_t *)(streams), gpu_indexes, gpu_count,
         static_cast<uint64_t *>(radix_lwe_out),
         static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
     break;
   case 4096:
     host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -265,7 +268,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
         (cudaStream_t *)(streams), gpu_indexes, gpu_count,
         static_cast<uint64_t *>(radix_lwe_out),
         static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
     break;
   case 8192:
     host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -273,7 +277,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
         (cudaStream_t *)(streams), gpu_indexes, gpu_count,
         static_cast<uint64_t *>(radix_lwe_out),
         static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
     break;
   case 16384:
     host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -281,7 +286,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
         (cudaStream_t *)(streams), gpu_indexes, gpu_count,
         static_cast<uint64_t *>(radix_lwe_out),
         static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
     break;
   default:
     PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -186,9 +186,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
     Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
     uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
     uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
-    int_radix_lut<Torus> *reused_lut = nullptr) {
+    int_radix_lut<Torus> *reused_lut) {
 
   auto new_blocks = mem_ptr->new_blocks;
+  auto new_blocks_copy = mem_ptr->new_blocks_copy;
   auto old_blocks = mem_ptr->old_blocks;
   auto small_lwe_vector = mem_ptr->small_lwe_vector;
 
@@ -205,12 +206,27 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
   auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
   auto small_lwe_size = small_lwe_dimension + 1;
 
+  if (num_radix_in_vec == 0)
+    return;
+  if (num_radix_in_vec == 1) {
+    cuda_memcpy_async_gpu_to_gpu(radix_lwe_out, terms,
+                                 num_blocks_in_radix * big_lwe_size *
+                                     sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+    return;
+  }
   if (old_blocks != terms) {
     cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
                                  num_blocks_in_radix * num_radix_in_vec *
                                      big_lwe_size * sizeof(Torus),
                                  streams[0], gpu_indexes[0]);
   }
+  if (num_radix_in_vec == 2) {
+    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                         &old_blocks[num_blocks * big_lwe_size],
+                         big_lwe_dimension, num_blocks);
+    return;
+  }
 
   size_t r = num_radix_in_vec;
   size_t total_modulus = message_modulus * carry_modulus;
@@ -271,7 +287,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
     if (!ch_amount)
       ch_amount++;
     dim3 add_grid(ch_amount, num_blocks, 1);
     size_t sm_size = big_lwe_size * sizeof(Torus);
 
-    cudaSetDevice(gpu_indexes[0]);
     tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
@@ -288,7 +303,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
         terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
         h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
         total_count, message_count, carry_count, sm_copy_count);
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
     auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
     luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
@@ -303,8 +317,11 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
     // inside d_smart_copy_in there are only -1 values
     // it's fine to call smart_copy with same pointer
     // as source and destination
-    smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
-        new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
+    cuda_memcpy_async_gpu_to_gpu(new_blocks_copy, new_blocks,
+                                 r * num_blocks * big_lwe_size * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+    smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
+        new_blocks, new_blocks_copy, d_smart_copy_out, d_smart_copy_in,
         big_lwe_size);
     check_cuda_error(cudaGetLastError());
 
@@ -422,9 +439,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
   luts_message_carry->release(streams, gpu_indexes, gpu_count);
   delete (luts_message_carry);
 
-  host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-                &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
-                num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                       &old_blocks[num_blocks * big_lwe_size],
+                       big_lwe_dimension, num_blocks);
 }
 
 template <typename Torus, class params>
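The new_blocks_copy staging buffer introduced in the hunk above sidesteps a classic in-place permutation hazard: when source and destination alias, a gather indexed through the same array can read slots that were already overwritten by earlier threads or iterations. A minimal clear-data illustration of the difference:

// Sketch only: permuting through the same array vs. through a snapshot.
#include <cstdio>

int main() {
  int src_idx[3] = {1, 2, 0}; // out[i] = in[src_idx[i]]

  int in_place[3] = {10, 20, 30};
  for (int i = 0; i < 3; i++)
    in_place[i] = in_place[src_idx[i]]; // slot 0 is read after being clobbered

  int snapshot[3] = {10, 20, 30}, out[3];
  for (int i = 0; i < 3; i++)
    out[i] = snapshot[src_idx[i]]; // always reads the original values

  printf("in-place: %d %d %d\n", in_place[0], in_place[1], in_place[2]); // 20 30 20
  printf("copy:     %d %d %d\n", out[0], out[1], out[2]);                // 20 30 10
  return 0;
}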
@@ -1,14 +1,16 @@
 #include "integer/negation.cuh"
 
-void cuda_negate_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus) {
+void cuda_negate_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus) {
 
-  host_integer_radix_negation(
+  host_integer_radix_negation<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_array),
-      lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), lwe_dimension,
+      lwe_ciphertext_count, message_modulus, carry_modulus);
 }
 
 void scratch_cuda_integer_radix_overflowing_sub_kb_64(
@@ -25,14 +25,13 @@ template <typename Torus>
 __global__ void
 device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
                               uint64_t lwe_dimension, uint64_t message_modulus,
-                              uint64_t carry_modulus, uint64_t delta) {
+                              uint64_t delta) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < lwe_dimension + 1) {
     bool is_body = (tid == lwe_dimension);
 
     // z = ceil( degree / 2^p ) * 2^p
     uint64_t z = (2 * message_modulus - 1) / message_modulus;
-    __syncthreads();
     z *= message_modulus;
 
     // (0,Delta*z) - ct
@@ -47,12 +46,9 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
 
       uint64_t encoded_zb = zb * delta;
 
-      __syncthreads();
-
       // (0,Delta*z) - ct
       output[tid] =
           (is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
-      __syncthreads();
     }
   }
 }
@@ -75,16 +71,15 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
   dim3 grid(num_blocks, 1, 1);
   dim3 thds(num_threads, 1, 1);
-  uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);
 
   // Value of the shift we multiply our messages by
   // If message_modulus and carry_modulus are always powers of 2 we can simplify
   // this
   uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
 
-  device_integer_radix_negation<<<grid, thds, shared_mem, streams[0]>>>(
-      output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
-      carry_modulus, delta);
+  device_integer_radix_negation<<<grid, thds, 0, streams[0]>>>(
+      output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
+      delta);
   check_cuda_error(cudaGetLastError());
 }
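A quick clear-text check of the negation identity used by this kernel: subtracting the encoding of m from the trivial encoding of z*Delta returns z - m, and because z is a multiple of message_modulus the result decodes to -m mod message_modulus. The sketch below ignores noise and the multi-block zb correction:

// Sketch only: verify the (0, z*Delta) - ct trick on clear encodings.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t message_modulus = 4, carry_modulus = 4;
  const uint64_t delta =
      (uint64_t(1) << 63) / (message_modulus * carry_modulus); // 2^59
  // z = ceil(degree / 2^p) * 2^p with degree = message_modulus - 1 gives z = 4
  const uint64_t z =
      (2 * message_modulus - 1) / message_modulus * message_modulus;
  for (uint64_t m = 0; m < message_modulus; m++) {
    uint64_t body = m * delta;           // trivial encoding of m
    uint64_t negated = z * delta - body; // the kernel's body computation
    uint64_t decoded = (negated / delta) % message_modulus;
    printf("m=%llu -> decoded(-m)=%llu\n", (unsigned long long)m,
           (unsigned long long)decoded); // prints 0, 3, 2, 1
  }
  return 0;
}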
@@ -107,7 +102,7 @@ __host__ void host_integer_overflowing_sub_kb(
 
   auto radix_params = mem_ptr->params;
 
-  host_unchecked_sub_with_correcting_term(
+  host_unchecked_sub_with_correcting_term<Torus>(
       streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left,
       radix_lwe_right, radix_params.big_lwe_dimension, num_blocks,
       radix_params.message_modulus, radix_params.carry_modulus,
@@ -5,7 +5,7 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
     uint32_t message_modulus, uint32_t carry_modulus) {
 
-  host_integer_radix_scalar_addition_inplace(
+  host_integer_radix_scalar_addition_inplace<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
       lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
 
@@ -45,9 +45,10 @@ __host__ void host_integer_radix_scalar_addition_inplace(
   // this
   uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
 
-  device_integer_radix_scalar_addition_inplace<<<grid, thds, 0, streams[0]>>>(
-      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
-      delta);
+  device_integer_radix_scalar_addition_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
+                                      input_lwe_ciphertext_count, lwe_dimension,
+                                      delta);
   check_cuda_error(cudaGetLastError());
 }
 
@@ -83,8 +84,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
   // this
   uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
 
-  device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0, streams[0]>>>(
-      lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
+  device_integer_radix_add_scalar_one_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, input_lwe_ciphertext_count,
+                                      lwe_dimension, delta);
   check_cuda_error(cudaGetLastError());
 }
 
@@ -122,10 +124,10 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
   // this
   uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
 
-  device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
-                                                    streams[0]>>>(
-      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
-      delta);
+  device_integer_radix_scalar_subtraction_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
+                                      input_lwe_ciphertext_count, lwe_dimension,
+                                      delta);
   check_cuda_error(cudaGetLastError());
 }
 #endif
@@ -3,6 +3,58 @@
 
 #include "integer/comparison.cuh"
 
+template <typename Torus>
+__host__ void scalar_compare_radix_blocks_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {
+
+  if (num_radix_blocks == 0)
+    return;
+  auto params = mem_ptr->params;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+  auto message_modulus = params.message_modulus;
+  auto carry_modulus = params.carry_modulus;
+
+  // When rhs > lhs, the subtraction will overflow, and the bit of padding will
+  // be set to 1
+  // meaning that the output of the pbs will be the negative (modulo message
+  // space)
+  //
+  // Example:
+  // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
+  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
+  // Since there was an overflow the bit of padding is 1 and not 0.
+  // When applying the LUT for an input value of 14 we would expect 1,
+  // but since the bit of padding is 1, we will get -1 modulus our message
+  // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
+
+  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
+  cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
+                               num_radix_blocks * (big_lwe_dimension + 1) *
+                                   sizeof(Torus),
+                               streams[0], gpu_indexes[0]);
+  // Subtract
+  // Here we need the true lwe sub, not the one that comes from shortint.
+  host_integer_radix_scalar_subtraction_inplace<Torus>(
+      streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
+      big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
+
+  // Apply LUT to compare to 0
+  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
+      ksks, num_radix_blocks, sign_lut);
+
+  // Add one
+  // Here Lhs can have the following values: (-1) % (message modulus * carry
+  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
+  host_integer_radix_add_scalar_one_inplace<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
+      num_radix_blocks, message_modulus, carry_modulus);
+}
+
 template <typename Torus>
 __host__ void integer_radix_unsigned_scalar_difference_check_kb(
     cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
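The padding-bit walkthrough in the comment above can be checked with plain integers. A small simulation, assuming 1 padding bit on top of a 16-value message/carry space (so arithmetic is mod 32):

// Sketch only: clear-integer check of the overflow example in the comment.
#include <cstdio>

int main() {
  const int space = 4 * 4;    // message_modulus * carry_modulus
  const int full = 2 * space; // one extra padding bit on top
  int lhs = 1, rhs = 3;
  int diff = ((lhs - rhs) % full + full) % full; // 30 = 0b11110 = 1|1110
  int padding_bit = diff / space;                // 1: the subtraction overflowed
  printf("diff=%d padding=%d low=%d\n", diff, padding_bit, diff % space);
  // With the padding bit set, the is-non-zero LUT output is negated: it
  // returns -1 % 16 = 15 instead of 1; adding one maps it back to 0, which
  // yields the 0 / 1 / 2 "inferior / equal / superior" encoding.
  int lut_out = padding_bit ? (space - 1) : (diff % space != 0 ? 1 : 0);
  printf("lut_out=%d final=%d\n", lut_out, (lut_out + 1) % space); // 15 -> 0
  return 0;
}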
@@ -45,10 +97,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
   if (total_num_scalar_blocks == 0) {
     // We only have to compare blocks with zero
     // means scalar is zero
-    host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
-                                    mem_ptr->tmp_lwe_array_out, lwe_array_in,
-                                    mem_ptr, bsks, ksks, total_num_radix_blocks,
-                                    mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
+        lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks,
+        mem_ptr->is_zero_lut);
 
     auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
       x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -91,10 +143,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
     Torus *lhs = diff_buffer->tmp_packed_left;
     Torus *rhs = diff_buffer->tmp_packed_right;
 
-    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                total_num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       total_num_scalar_blocks, message_modulus);
 
     // From this point we have half number of blocks
     num_lsb_radix_blocks /= 2;
@@ -106,22 +159,22 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
     // - 2 if lhs > rhs
 
     auto comparisons = mem_ptr->tmp_block_comparisons;
-    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                   comparisons, lhs, rhs, mem_ptr, bsks, ksks,
-                                   num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
+                                          comparisons, lhs, rhs, mem_ptr, bsks,
+                                          ksks, num_lsb_radix_blocks);
 
     // Reduces a vec containing radix blocks that encrypts a sign
     // (inferior, equal, superior) to one single radix block containing the
     // final sign
-    tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
-                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        mem_ptr->identity_lut_f, bsks, ksks,
-                        num_lsb_radix_blocks);
+    tree_sign_reduction<Torus>(
+        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+        mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
+        num_lsb_radix_blocks);
     //////////////
     // msb
-    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                    lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
-                                    num_msb_radix_blocks, mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality<Torus>(
+        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
+        bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
     for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
       cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
       cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
@@ -145,7 +198,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
         scalar_bivariate_last_leaf_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 
-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
         lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);
 
@@ -159,10 +212,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
     Torus *lhs = diff_buffer->tmp_packed_left;
     Torus *rhs = diff_buffer->tmp_packed_right;
 
-    pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       num_scalar_blocks, message_modulus);
 
     // From this point we have half number of blocks
     num_lsb_radix_blocks /= 2;
@@ -173,16 +227,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
     // - 1 if lhs == rhs
     // - 2 if lhs > rhs
     auto comparisons = mem_ptr->tmp_lwe_array_out;
-    scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
-                                   lhs, rhs, mem_ptr, bsks, ksks,
-                                   num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                          comparisons, lhs, rhs, mem_ptr, bsks,
+                                          ksks, num_lsb_radix_blocks);
 
     // Reduces a vec containing radix blocks that encrypts a sign
     // (inferior, equal, superior) to one single radix block containing the
     // final sign
-    tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
-                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        sign_handler_f, bsks, ksks, num_lsb_radix_blocks);
+    tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                               comparisons, mem_ptr->diff_buffer->tree_buffer,
+                               sign_handler_f, bsks, ksks,
+                               num_lsb_radix_blocks);
   }
 }
 
@@ -229,7 +284,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     // We only have to compare blocks with zero
     // means scalar is zero
     Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
-    host_compare_with_zero_equality(
+    host_compare_with_zero_equality<Torus>(
         streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
         mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
     Torus *sign_block =
@@ -277,7 +332,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
         scalar_bivariate_last_leaf_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 
-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
         sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);
 
@@ -304,10 +359,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     Torus *lhs = diff_buffer->tmp_packed_left;
     Torus *rhs = diff_buffer->tmp_packed_right;
 
-    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                total_num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       total_num_scalar_blocks, message_modulus);
 
     // From this point we have half number of blocks
     num_lsb_radix_blocks /= 2;
@@ -319,24 +375,24 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     // - 2 if lhs > rhs
 
     auto comparisons = mem_ptr->tmp_block_comparisons;
-    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                   comparisons, lhs, rhs, mem_ptr, bsks, ksks,
-                                   num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
+                                          comparisons, lhs, rhs, mem_ptr, bsks,
+                                          ksks, num_lsb_radix_blocks);
 
     // Reduces a vec containing radix blocks that encrypts a sign
     // (inferior, equal, superior) to one single radix block containing the
     // final sign
-    tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
-                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        mem_ptr->identity_lut_f, bsks, ksks,
-                        num_lsb_radix_blocks);
+    tree_sign_reduction<Torus>(
+        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+        mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
+        num_lsb_radix_blocks);
     //////////////
     // msb
     // We remove the last block (which is the sign)
     Torus *are_all_msb_zeros = lwe_array_msb_out;
-    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                    are_all_msb_zeros, msb, mem_ptr, bsks, ksks,
-                                    num_msb_radix_blocks, mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality<Torus>(
+        msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb, mem_ptr,
+        bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
 
     auto sign_bit_pos = (int)log2(message_modulus) - 1;
 
@@ -371,7 +427,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 
     Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
         msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
         are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
         signed_msb_lut->params.message_modulus);
@@ -382,8 +438,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
 
     //////////////
     // Reduce the two blocks into one final
-    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
-                 lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2);
+    reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                        lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks,
+                        2);
 
   } else {
     // We only have to do the regular comparison
@@ -403,10 +460,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     Torus *lhs = diff_buffer->tmp_packed_left;
     Torus *rhs = diff_buffer->tmp_packed_right;
 
-    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                num_lsb_radix_blocks - 1, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks - 1,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       num_lsb_radix_blocks - 1, message_modulus);
 
     // From this point we have half number of blocks
     num_lsb_radix_blocks /= 2;
@@ -415,19 +473,19 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     // - 0 if lhs < rhs
     // - 1 if lhs == rhs
     // - 2 if lhs > rhs
-    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                   lwe_array_ct_out, lhs, rhs, mem_ptr, bsks,
-                                   ksks, num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
+                                          lwe_array_ct_out, lhs, rhs, mem_ptr,
+                                          bsks, ksks, num_lsb_radix_blocks);
     Torus *encrypted_sign_block =
         lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
     Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
 
     auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
-    create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
-                         scalar_sign_block, big_lwe_dimension, 1, 1,
-                         message_modulus, carry_modulus);
+    create_trivial_radix<Torus>(
+        msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
+        big_lwe_dimension, 1, 1, message_modulus, carry_modulus);
 
-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
        encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
        mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
@@ -439,9 +497,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     // Reduces a vec containing radix blocks that encrypts a sign
     // (inferior, equal, superior) to one single radix block containing the
     // final sign
-    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
-                 lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
-                 num_lsb_radix_blocks + 1);
+    reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                        lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
+                        num_lsb_radix_blocks + 1);
   }
 }
@@ -452,14 +510,13 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
     int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
     uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
 
-  cudaSetDevice(gpu_indexes[0]);
   auto params = mem_ptr->params;
   // Calculates the difference sign between the ciphertext and the scalar
   // - 0 if lhs < rhs
   // - 1 if lhs == rhs
   // - 2 if lhs > rhs
   auto sign = mem_ptr->tmp_lwe_array_out;
-  integer_radix_signed_scalar_difference_check_kb(
+  integer_radix_signed_scalar_difference_check_kb<Torus>(
       streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
       mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
       total_num_scalar_blocks);
@@ -469,17 +526,17 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
   auto lwe_array_left = lwe_array_in;
   auto lwe_array_right = mem_ptr->tmp_block_comparisons;
 
-  create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
-                       scalar_blocks, params.big_lwe_dimension,
-                       total_num_radix_blocks, total_num_scalar_blocks,
-                       params.message_modulus, params.carry_modulus);
+  create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
+                              scalar_blocks, params.big_lwe_dimension,
+                              total_num_radix_blocks, total_num_scalar_blocks,
+                              params.message_modulus, params.carry_modulus);
 
   // Selector
   // CMUX for Max or Min
-  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             sign, lwe_array_left, lwe_array_right,
-                             mem_ptr->cmux_buffer, bsks, ksks,
-                             total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                    lwe_array_out, sign, lwe_array_left,
+                                    lwe_array_right, mem_ptr->cmux_buffer, bsks,
+                                    ksks, total_num_radix_blocks);
 }
 
 template <typename Torus>
@@ -492,12 +549,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
 
   if (mem_ptr->is_signed) {
     // is signed and scalar is positive
-    integer_radix_signed_scalar_difference_check_kb(
+    integer_radix_signed_scalar_difference_check_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
        total_num_radix_blocks, total_num_scalar_blocks);
   } else {
-    integer_radix_unsigned_scalar_difference_check_kb(
+    integer_radix_unsigned_scalar_difference_check_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
        total_num_radix_blocks, total_num_scalar_blocks);
@@ -513,70 +570,16 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb(
 
   if (mem_ptr->is_signed) {
     // is signed and scalar is positive
-    integer_radix_signed_scalar_maxmin_kb(
+    integer_radix_signed_scalar_maxmin_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
        total_num_scalar_blocks);
   } else {
-    integer_radix_unsigned_scalar_maxmin_kb(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
-        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
-        total_num_scalar_blocks);
+    PANIC("Cuda error: only signed scalar maxmin can be called in signed "
+          "scalar comparison")
   }
 }
 
-template <typename Torus>
-__host__ void scalar_compare_radix_blocks_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
-    uint32_t num_radix_blocks) {
-
-  if (num_radix_blocks == 0)
-    return;
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  // When rhs > lhs, the subtraction will overflow, and the bit of padding will
-  // be set to 1
-  // meaning that the output of the pbs will be the negative (modulo message
-  // space)
-  //
-  // Example:
-  // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
-  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
-  // Since there was an overflow the bit of padding is 1 and not 0.
-  // When applying the LUT for an input value of 14 we would expect 1,
-  // but since the bit of padding is 1, we will get -1 modulus our message
-  // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
-
-  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
-  cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
-                               num_radix_blocks * (big_lwe_dimension + 1) *
-                                   sizeof(Torus),
-                               streams[0], gpu_indexes[0]);
-  // Subtract
-  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_integer_radix_scalar_subtraction_inplace(
-      streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
-      big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
-
-  // Apply LUT to compare to 0
-  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb(
-      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
-      ksks, num_radix_blocks, sign_lut);
-
-  // Add one
-  // Here Lhs can have the following values: (-1) % (message modulus * carry
-  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(
-      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
-      num_radix_blocks, message_modulus, carry_modulus);
-}
-
 template <typename Torus>
 __host__ void host_integer_radix_scalar_maxmin_kb(
     cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -591,7 +594,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
   // - 1 if lhs == rhs
   // - 2 if lhs > rhs
   auto sign = mem_ptr->tmp_lwe_array_out;
-  host_integer_radix_scalar_difference_check_kb(
+  host_integer_radix_scalar_difference_check_kb<Torus>(
       streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
      total_num_scalar_blocks);
@@ -601,17 +604,17 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
   auto lwe_array_left = lwe_array_in;
   auto lwe_array_right = mem_ptr->tmp_block_comparisons;
 
-  create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
-                       scalar_blocks, params.big_lwe_dimension,
-                       total_num_radix_blocks, total_num_scalar_blocks,
-                       params.message_modulus, params.carry_modulus);
+  create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
+                              scalar_blocks, params.big_lwe_dimension,
+                              total_num_radix_blocks, total_num_scalar_blocks,
+                              params.message_modulus, params.carry_modulus);
 
   // Selector
   // CMUX for Max or Min
-  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
-                             total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out,
+      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
+      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
 }
 
 template <typename Torus>
@@ -659,10 +662,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
     auto packed_scalar =
        packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
 
-    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0,
-                num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_scalar,
+                       scalar_blocks, 0, num_scalar_blocks, message_modulus);
 
     cuda_memcpy_async_gpu_to_gpu(
        scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
@@ -670,7 +674,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
        gpu_indexes[0]);
     scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
 
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
        bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
   }
@@ -689,9 +693,9 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
       PANIC("Cuda error: integer operation not supported")
     }
 
-    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                    lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
-                                    num_msb_radix_blocks, msb_lut);
+    host_compare_with_zero_equality<Torus>(
+        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
+        bsks, ksks, num_msb_radix_blocks, msb_lut);
   }
 
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
@@ -701,13 +705,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
 
   switch (mem_ptr->op) {
   case COMPARISON_TYPE::EQ:
-    are_all_comparisons_block_true(
+    are_all_comparisons_block_true<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
       mem_ptr, bsks, ksks,
       num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
     break;
   case COMPARISON_TYPE::NE:
-    is_at_least_one_comparisons_block_true(
+    is_at_least_one_comparisons_block_true<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
       mem_ptr, bsks, ksks,
       num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
@@ -65,7 +65,7 @@ __host__ void host_integer_scalar_mul_radix(
cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
lwe_size_bytes * num_radix_blocks,
streams[0], gpu_indexes[0]);
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
streams, gpu_indexes, gpu_count, ptr, shift_amount,
mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
} else {
@@ -82,9 +82,9 @@ __host__ void host_integer_scalar_mul_radix(
preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
T *block_shift_buffer =
all_shifted_buffer + j * num_radix_blocks * lwe_size;
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
block_shift_buffer, preshifted_radix_ct,
i / msg_bits, num_radix_blocks, lwe_size);
host_radix_blocks_rotate_right<T>(
streams, gpu_indexes, gpu_count, block_shift_buffer,
preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size);
// create trivial assign for value = 0
cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
streams[0], gpu_indexes[0]);
@@ -108,7 +108,7 @@ __host__ void host_integer_scalar_mul_radix(
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
num_radix_blocks, j, nullptr);

auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
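The shifted copies summed by host_integer_partial_sum_ciphertexts_vec_kb implement ordinary shift-and-add multiplication; a plain-integer sketch of the strategy (illustrative only, not the GPU code path):

#include <cstdint>

// Each set bit of the scalar contributes one shifted copy of the value;
// the GPU version materializes the copies in all_shifted_buffer and then
// sums them with carry propagation.
uint64_t scalar_mul_shift_and_add(uint64_t value, uint64_t scalar) {
  uint64_t acc = 0;
  for (int i = 0; i < 64; i++)
    if ((scalar >> i) & 1)
      acc += value << i;
  return acc;
}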
@@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
// one block is responsible to process single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);

cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
@@ -70,9 +70,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1, num_blocks,
big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
num_blocks, big_lwe_size);

auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

@@ -83,9 +83,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

} else {
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);

cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
@@ -97,8 +97,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
lwe_array, 1, num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1, num_blocks,
big_lwe_size);

auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
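Why a left rotation of the integer turns into a right rotation of the storage: with blocks ordered least-significant first, every block moves to a higher index and the top block wraps around. A sketch (the rotation direction naming is an assumption matching the comments above):

#include <algorithm>
#include <cstdint>
#include <vector>

// Rotate the integer left by one block: block i moves to index i + 1 and
// the most significant block wraps to index 0, i.e. a right rotation of
// the LSB-first array.
void rotate_integer_left_one_block(std::vector<uint64_t> &blocks) {
  std::rotate(blocks.rbegin(), blocks.rbegin() + 1, blocks.rend());
}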
@@ -53,9 +53,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);

// create trivial assign for value = 0
cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
@@ -83,9 +83,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

} else {
// right shift
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);

// rotate left as the blocks are from LSB to MSB
// create trivial assign for value = 0
@@ -156,9 +156,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
Torus *last_block_copy = &padding_block[big_lwe_size];

if (mem->shift_type == RIGHT_SHIFT) {
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
@@ -213,7 +213,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
}
auto lut_univariate_padding_block =
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
// Replace blocks 'pulled' from the left with the correct padding
@@ -227,7 +227,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
if (shift_within_block != 0) {
auto lut_univariate_shift_last_block =
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_2, gpu_indexes, gpu_count, last_block,
last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
}
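The padding-block LUT gives the arithmetic (sign-extending) variant of the shift; in plain integers the same effect is what a signed right shift does. A sketch, assuming a two's-complement view of the radix integer:

#include <cstdint>

// A logical shift fills vacated high blocks with zeros; the arithmetic
// variant fills them from the sign, which the LUT on the last block
// computes homomorphically.
int64_t arithmetic_shift_right(int64_t value, uint32_t shift) {
  return value >> shift; // sign-extending on two's-complement targets
}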
@@ -88,9 +88,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
switch (mem->shift_type) {
case LEFT_SHIFT:
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);

if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
for (int i = 0; i < rotations; i++)
@@ -103,9 +103,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
break;
case RIGHT_SHIFT:
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);

if (mem->is_signed)
for (int i = 0; i < rotations; i++)
@@ -119,38 +119,39 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
break;
case LEFT_ROTATE:
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
break;
case RIGHT_ROTATE:
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
break;
default:
PANIC("Unknown operation")
}

// pack bits into one block so that we have
// host_pack bits into one block so that we have
// control_bit|b|a
cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
streams[0], gpu_indexes[0]); // Do we need this?
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs,
mux_lut->lwe_indexes_out, rotated_input, input_bits_a,
mux_lut->lwe_indexes_in, big_lwe_dimension, 2,
total_nb_bits);
pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count, mux_inputs,
mux_lut->lwe_indexes_out, rotated_input,
input_bits_a, mux_lut->lwe_indexes_in,
big_lwe_dimension, 2, total_nb_bits);

// The shift bit is already properly aligned/positioned
for (int i = 0; i < total_nb_bits; i++)
host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size,
mux_inputs + i * big_lwe_size, shift_bit,
mem->params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0],
mux_inputs + i * big_lwe_size,
mux_inputs + i * big_lwe_size, shift_bit,
mem->params.big_lwe_dimension, 1);

// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
total_nb_bits, mux_lut);
}
@@ -179,8 +180,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
auto bit_to_add = input_bits_a + i * big_lwe_size;

for (int j = 0; j < num_radix_blocks; j++) {
host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add,
big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], block, block, bit_to_add,
big_lwe_dimension, 1);

block += big_lwe_size;
bit_to_add += bits_per_block * big_lwe_size;
@@ -188,7 +189,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
num_radix_blocks, cleaning_lut);
}
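The mux LUT operates on one block holding the packed value control_bit|b|a; under that packing assumption (two candidate bits plus one control bit, with a in the lowest position) the table it evaluates is simply:

#include <cstdint>

// packed = control_bit * 4 + b * 2 + a (assumed layout). The LUT returns
// b when the encrypted shift-amount bit is set, a otherwise.
uint64_t mux_lut(uint64_t packed) {
  uint64_t a = packed & 1;
  uint64_t b = (packed >> 1) & 1;
  uint64_t control = (packed >> 2) & 1;
  return control ? b : a;
}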
@@ -11,11 +11,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}

/*
@@ -51,11 +51,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u32 input LWE ciphertext vector with a u32
@@ -66,11 +66,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
host_addition_plaintext<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
@@ -105,9 +106,10 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
host_addition_plaintext<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

@@ -43,7 +43,7 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
(lwe_dimension + 1) * lwe_ciphertext_count,
stream, gpu_index);
plaintext_addition<<<grid, thds, 0, stream>>>(
plaintext_addition<T><<<grid, thds, 0, stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
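Adding a plaintext to an LWE ciphertext only offsets the body coefficient; the mask is copied unchanged, which is why the memcpy above precedes the kernel. A CPU sketch of the per-ciphertext effect (assumed layout: mask words followed by one body word):

#include <cstdint>
#include <vector>

void add_plaintext_cpu(std::vector<uint64_t> &ct, uint64_t plaintext,
                       uint32_t lwe_dimension) {
  // ct holds lwe_dimension mask words then one body word.
  ct[lwe_dimension] += plaintext; // wrapping addition on the body only
}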
@@ -78,7 +78,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

addition<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
addition<T><<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
check_cuda_error(cudaGetLastError());
}

@@ -112,7 +112,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

subtraction<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
subtraction<T>
<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
check_cuda_error(cudaGetLastError());
}

@@ -150,7 +151,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
(input_lwe_dimension + 1) * sizeof(T),
stream, gpu_index);

radix_body_subtraction_inplace<<<grid, thds, 0, stream>>>(
radix_body_subtraction_inplace<T><<<grid, thds, 0, stream>>>(
output, plaintext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -176,7 +177,6 @@ __global__ void unchecked_sub_with_correcting_term(
}
}
template <typename T>

__host__ void host_unchecked_sub_with_correcting_term(
cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
@@ -193,7 +193,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

unchecked_sub_with_correcting_term<<<grid, thds, 0, stream>>>(
unchecked_sub_with_correcting_term<T><<<grid, thds, 0, stream>>>(
output, input_1, input_2, num_entries, lwe_size, message_modulus,
carry_modulus, degree);
check_cuda_error(cudaGetLastError());

@@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
host_cleartext_vec_multiplication<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
@@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
host_cleartext_vec_multiplication<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

@@ -14,9 +14,10 @@
#include <vector>

template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
__global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
T *cleartext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {

int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
@@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
}
}

template <typename T>
__host__ void
host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

cleartext_vec_multiplication<T><<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}

template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {

int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = lwe_input[index] * cleartext_input;
}
}
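The wrapping comment above is the whole point: u64 words represent torus elements x / 2^64, so multiplication modulo 2^64 is exact multiplication on the discretized torus. A quick standalone check:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t x = 0xF000000000000000ull; // 15/16 of a turn on the torus
  uint64_t k = 3;
  // 3 * 15/16 = 2 + 13/16; the whole turns vanish modulo 2^64.
  printf("%llx\n", (unsigned long long)(x * k)); // prints d000000000000000
  return 0;
}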

template <typename T>
__host__ void
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T *cleartext_input,
T *output, T *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

@@ -45,7 +82,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

cleartext_multiplication<<<grid, thds, 0, stream>>>(
cleartext_multiplication<T><<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}

@@ -10,10 +10,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}

/*
@@ -44,8 +44,8 @@ void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}

@@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

negation<<<grid, thds, 0, stream>>>(output, input, num_entries);
negation<T><<<grid, thds, 0, stream>>>(output, input, num_entries);
check_cuda_error(cudaGetLastError());
}


@@ -1,15 +1,5 @@
#include "programmable_bootstrap_amortized.cuh"

/*
* Returns the buffer size for 64 bits executions
*/
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count) {
return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
}

/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32 bits inputs, into `buffer`. It also

@@ -256,7 +256,7 @@ __host__ void execute_cg_external_product_loop(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, int lwe_offset) {
uint32_t lwe_chunk_size, uint32_t lwe_offset) {

uint64_t full_dm =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
@@ -275,6 +275,8 @@ __host__ void execute_cg_external_product_loop(

uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;

auto d_mem = buffer->d_mem_acc_cg;
auto keybundle_fft = buffer->keybundle_fft;

@@ -182,25 +182,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
}
#endif

/*
* Returns the buffer size for 64 bits executions
*/
uint64_t get_buffer_size_programmable_bootstrap_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {

if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count))
return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count);
else
return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count);
}

template <typename Torus>
void scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,

@@ -18,9 +18,9 @@
#include <vector>

template <typename Torus, class params>
__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
uint32_t ggsw_idx,
uint32_t grouping_factor) {
__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
uint32_t ggsw_idx,
uint32_t grouping_factor) {
Torus x = 0;
for (int i = 0; i < grouping_factor; i++) {
uint32_t mask_position = grouping_factor - (i + 1);
@@ -31,6 +31,13 @@ __device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
return modulus_switch(x, params::log2_degree + 1);
}
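modulus_switch maps a torus word onto a monomial exponent in [0, 2N); a common formulation (an assumption here, consistent with the log2_degree + 1 output bits above) keeps the top bits of the word with rounding:

#include <cstdint>

// Keep the top log_modulus bits of x, rounding to nearest, so the result
// indexes X^d with d in [0, 2N) when log_modulus = log2(N) + 1.
uint64_t modulus_switch_sketch(uint64_t x, uint32_t log_modulus) {
  uint64_t rounding = 1ull << (63 - log_modulus);
  return (x + rounding) >> (64 - log_modulus);
}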

__device__ __forceinline__ int
get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
uint32_t level_count) {
return polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) *
level_count;
}

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle(
const Torus *__restrict__ lwe_array_in,
@@ -60,8 +67,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
uint32_t input_idx = blockIdx.x / lwe_chunk_size;

if (lwe_iteration < (lwe_dimension / grouping_factor)) {
//
Torus *accumulator = (Torus *)selected_memory;

const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
@@ -81,56 +86,52 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
const Torus *bsk_poly_ini = bsk_slice + poly_id * params::degree;

copy_polynomial<Torus, params::opt, params::degree / params::opt>(
bsk_poly, accumulator);
Torus reg_acc[params::opt];

copy_polynomial_in_regs<Torus, params::opt, params::degree / params::opt>(
bsk_poly_ini, reg_acc);

int offset =
get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count);

// Precalculate the monomial degrees and store them in shared memory
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
synchronize_threads_in_block();

// Accumulate the other terms
for (int g = 1; g < (1 << grouping_factor); g++) {

const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
uint32_t monomial_degree = monomial_degrees[g];

// Calculates the monomial degree
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
lwe_array_group, g, grouping_factor);

synchronize_threads_in_block();
const Torus *bsk_poly = bsk_poly_ini + g * offset;
// Multiply by the bsk element
polynomial_product_accumulate_by_monomial<Torus, params>(
accumulator, bsk_poly, monomial_degree, false);
polynomial_product_accumulate_by_monomial_nosync<Torus, params>(
reg_acc, bsk_poly, monomial_degree);
}
synchronize_threads_in_block(); // needed because we are going to reuse the
// shared memory for the fft

synchronize_threads_in_block();

// Move accumulator to local memory
double2 temp[params::opt / 2];
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
temp[i].y =
__ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
temp[i].x /= (double)std::numeric_limits<Torus>::max();
temp[i].y /= (double)std::numeric_limits<Torus>::max();
tid += params::degree / params::opt;
}

synchronize_threads_in_block();
// Move from local memory back to shared memory but as complex
tid = threadIdx.x;
int tid = threadIdx.x;
double2 *fft = (double2 *)selected_memory;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = temp[i];
fft[tid] =
make_double2(__ll2double_rn((int64_t)reg_acc[i]) /
(double)std::numeric_limits<Torus>::max(),
__ll2double_rn((int64_t)reg_acc[i + params::opt / 2]) /
(double)std::numeric_limits<Torus>::max());
tid += params::degree / params::opt;
}
synchronize_threads_in_block();

NSMFFT_direct<HalfDegree<params>>(fft);

// lwe iteration
@@ -464,7 +465,7 @@ __host__ void execute_compute_keybundle(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, int lwe_offset) {
uint32_t lwe_chunk_size, uint32_t lwe_offset) {

uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
@@ -505,14 +506,12 @@ __host__ void execute_compute_keybundle(
}
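The lwe_offset/lwe_chunk_size pair implements plain chunked iteration over the lwe_dimension / grouping_factor loop; switching lwe_offset to uint32_t keeps the clamp below well typed. A sketch of the pattern:

#include <algorithm>
#include <cstdint>

void process_in_chunks(uint32_t total, uint32_t chunk_size) {
  for (uint32_t offset = 0; offset < total; offset += chunk_size) {
    // The last chunk is clamped, exactly like the std::min above.
    uint32_t this_chunk = std::min(chunk_size, total - offset);
    // ... launch kernels over iterations [offset, offset + this_chunk) ...
    (void)this_chunk;
  }
}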

template <typename Torus, class params>
__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int j, int lwe_offset) {
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) {

uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -561,14 +560,12 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
}

template <typename Torus, class params>
__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *lwe_output_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
int32_t grouping_factor, uint32_t level_count,
int j, int lwe_offset, uint32_t lwe_chunk_size) {
__host__ void execute_step_two(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
uint32_t j, uint32_t lwe_offset, uint32_t lwe_chunk_size) {

uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
@@ -626,7 +623,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
// Accumulate
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
for (int j = 0; j < chunk_size; j++) {
for (uint32_t j = 0; j < chunk_size; j++) {
execute_step_one<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,

@@ -267,7 +267,7 @@ __host__ void execute_tbc_external_product_loop(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, int lwe_offset) {
uint32_t lwe_chunk_size, uint32_t lwe_offset) {

auto supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
@@ -294,6 +294,8 @@ __host__ void execute_tbc_external_product_loop(

uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;

auto d_mem = buffer->d_mem_acc_tbc;
auto keybundle_fft = buffer->keybundle_fft;

@@ -31,6 +31,13 @@ __device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
tid = tid + block_size;
}
}
template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial_in_regs(const T *__restrict__ source, T *dst) {
#pragma unroll
for (int i = 0; i < elems_per_thread; i++) {
dst[i] = source[threadIdx.x + i * block_size];
}
}

/*
* Receives num_poly concatenated polynomials of type T. For each:
@@ -215,6 +222,8 @@ __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
// params::degree - tid - 1 can't be negative, tid goes from 0 to
// params::degree - 1
auto x = glwe_slice[params::degree - tid - 1];
result[i] = SEL(-x, x, tid >= params::degree - nth);
tid = tid + params::degree / params::opt;

@@ -55,21 +55,22 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}
}

// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <typename T, class params>
__device__ void
polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
uint64_t monomial_degree,
bool init_accumulator = false) {
// monomial_degree \in [0, 2 * params::degree)
int full_cycles_count = monomial_degree / params::degree;
int remainder_degrees = monomial_degree % params::degree;
// This method expects to work with polynomial_size / compression_params::opt
// threads in the x-block If init_accumulator is set, assumes that result was
// not initialized and does that with the outcome of first * second
template <typename T>
__device__ void polynomial_accumulate_monic_monomial_mul(
T *result, const T *__restrict__ poly, uint64_t monomial_degree,
uint32_t tid, uint32_t polynomial_size, int coeff_per_thread,
bool init_accumulator = false) {
// monomial_degree \in [0, 2 * compression_params::degree)
int full_cycles_count = monomial_degree / polynomial_size;
int remainder_degrees = monomial_degree % polynomial_size;

int pos = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
int pos = tid;
for (int i = 0; i < coeff_per_thread; i++) {
T element = poly[pos];
int new_pos = (pos + monomial_degree) % params::degree;
int new_pos = (pos + monomial_degree) % polynomial_size;

T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient
x = SEL(-x, x, new_pos >= remainder_degrees);
@@ -78,7 +79,32 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
result[new_pos] = x;
else
result[new_pos] += x;
pos += params::degree / params::opt;
pos += polynomial_size / coeff_per_thread;
}
}

template <typename T, class params>
__device__ void polynomial_product_accumulate_by_monomial_nosync(
T *result, const T *__restrict__ poly, uint32_t monomial_degree) {
// monomial_degree \in [0, 2 * params::degree)
int full_cycles_count = monomial_degree / params::degree;
int remainder_degrees = monomial_degree % params::degree;

// Every thread has a fixed position to track instead of "chasing" the
// position
#pragma unroll
for (int i = 0; i < params::opt; i++) {
int pos =
(threadIdx.x + i * (params::degree / params::opt) - monomial_degree) &
(params::degree - 1);

T element = poly[pos];
T x = SEL(element, -element, full_cycles_count % 2);
x = SEL(-x, x,
threadIdx.x + i * (params::degree / params::opt) >=
remainder_degrees);

result[i] += x;
}
}
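A scalar reference for what both variants compute: multiplying a negacyclic polynomial (X^N = -1) by the monic monomial X^d, with d in [0, 2N). Handy for checking the thread-parallel indexing above; the container-based version here is illustrative, not the device code:

#include <cstdint>
#include <vector>

std::vector<int64_t> monic_monomial_mul(const std::vector<int64_t> &poly,
                                        uint64_t degree) {
  size_t N = poly.size();
  std::vector<int64_t> out(N, 0);
  for (size_t i = 0; i < N; i++) {
    uint64_t pos = (i + degree) % (2 * N);
    if (pos < N)
      out[pos] += poly[i];
    else
      out[pos - N] -= poly[i]; // wrapped past N: X^N == -1
  }
  return out;
}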

@@ -6,7 +6,7 @@
std::mutex m;
bool p2p_enabled = false;

int cuda_setup_multi_gpu() {
int32_t cuda_setup_multi_gpu() {
int num_gpus = cuda_get_number_of_gpus();
if (num_gpus == 0)
PANIC("GPU error: the number of GPUs should be > 0.")
@@ -32,7 +32,7 @@ int cuda_setup_multi_gpu() {
}
m.unlock();
}
return num_used_gpus;
return (int32_t)(num_used_gpus);
}

int get_active_gpu_count(int num_inputs, int gpu_count) {

@@ -176,22 +176,22 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
}

scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
input_lwe_ciphertext_count, true);
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
grouping_factor, input_lwe_ciphertext_count, true);

for (auto _ : st) {
// Execute PBS
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count);
cuda_synchronize_stream(stream);
stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
pbs_base_log, pbs_level, input_lwe_ciphertext_count);
cuda_synchronize_stream(stream, gpu_index);
}

cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
cleanup_cuda_multi_bit_programmable_bootstrap(stream, gpu_index, &buffer);
}
#endif

@@ -255,23 +255,24 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
}

scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
stream, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer, glwe_dimension,
polynomial_size, pbs_level, input_lwe_ciphertext_count, true);
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);

for (auto _ : st) {
// Execute PBS
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
stream, (uint64_t *)d_lwe_ct_out_array,
stream, gpu_index, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count);
cuda_synchronize_stream(stream);
cuda_synchronize_stream(stream, gpu_index);
}

cleanup_cuda_programmable_bootstrap(stream, &buffer);
cleanup_cuda_programmable_bootstrap(stream, gpu_index, &buffer);
}
#endif

File diff suppressed because it is too large
@@ -20,7 +20,7 @@ def main(args):
bench_function_id = bench_data["function_id"]

split = bench_function_id.split("::")
if split.len() == 5: # Signed integers
if len(split) == 5: # Signed integers
(_, _, function_name, parameter_set, bits) = split
else: # Unsigned integers
(_, function_name, parameter_set, bits) = split
@@ -53,7 +53,8 @@ def main(args):
estimate_upper_bound_ms,
)
)
except:
except Exception as e:
print(e)
pass

if len(data) == 0:

@@ -58,7 +58,7 @@ flavor_name = "n3-A100x8-NVLink"
[backend.hyperstack.multi-gpu-test]
environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
flavor_name = "n3-A100x4"
flavor_name = "n3-RTX-A6000x4"

[command.signed_integer_full_bench]
workflow = "signed_integer_full_benchmark.yml"

@@ -133,8 +133,8 @@ if [[ "${backend}" == "gpu" ]]; then
test_threads=8
doctest_threads=8
else
test_threads=3
doctest_threads=3
test_threads=1
doctest_threads=1
fi
fi

@@ -1,6 +1,6 @@
[package]
name = "tfhe-zk-pok"
version = "0.3.0-alpha.0"
version = "0.3.0-alpha.1"
edition = "2021"
keywords = ["zero", "knowledge", "proof", "vector-commitments"]
homepage = "https://zama.ai/"
@@ -15,13 +15,17 @@ description = "tfhe-zk-pok: An implementation of zero-knowledge proofs of encryp
ark-bls12-381 = { package = "tfhe-ark-bls12-381", version = "0.4.0" }
ark-ec = { package = "tfhe-ark-ec", version = "0.4.2", features = ["parallel"] }
ark-ff = { package = "tfhe-ark-ff", version = "0.4.3", features = ["parallel"] }
ark-poly = { package = "tfhe-ark-poly", version = "0.4.2", features = ["parallel"] }
ark-poly = { package = "tfhe-ark-poly", version = "0.4.2", features = [
"parallel",
] }
ark-serialize = { version = "0.4.2" }
rand = "0.8.5"
rayon = "1.8.0"
sha3 = "0.10.8"
serde = { version = "~1.0", features = ["derive"] }
zeroize = "1.7.0"
num-bigint = "0.4.5"

[dev-dependencies]
serde_json = "~1.0"
itertools = "0.11.0"

@@ -210,9 +210,14 @@ impl CurveGroupOps<bls12_381::Zp> for bls12_381::G1 {
}

fn mul_scalar(self, scalar: bls12_381::Zp) -> Self {
self.mul_scalar(scalar)
if scalar.inner == MontFp!("2") {
self.double()
} else {
self.mul_scalar(scalar)
}
}

#[track_caller]
fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_381::Zp]) -> Self {
Self::Affine::multi_mul_scalar(bases, scalars)
}
@@ -245,9 +250,14 @@ impl CurveGroupOps<bls12_381::Zp> for bls12_381::G2 {
}

fn mul_scalar(self, scalar: bls12_381::Zp) -> Self {
self.mul_scalar(scalar)
if scalar.inner == MontFp!("2") {
self.double()
} else {
self.mul_scalar(scalar)
}
}

#[track_caller]
fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_381::Zp]) -> Self {
Self::Affine::multi_mul_scalar(bases, scalars)
}
@@ -273,6 +283,9 @@ impl PairingGroupOps<bls12_381::Zp, bls12_381::G1, bls12_381::G2> for bls12_381:
}

fn pairing(x: bls12_381::G1, y: bls12_381::G2) -> Self {
if x == bls12_381::G1::ZERO || y == bls12_381::G2::ZERO {
return Self::pairing(bls12_381::G1::ZERO, bls12_381::G2::GENERATOR);
}
Self::pairing(x, y)
}
}
@@ -329,12 +342,21 @@ impl CurveGroupOps<bls12_446::Zp> for bls12_446::G1 {
}

fn mul_scalar(self, scalar: bls12_446::Zp) -> Self {
self.mul_scalar(scalar)
if scalar.inner == MontFp!("2") {
self.double()
} else {
self.mul_scalar(scalar)
}
}

#[track_caller]
fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_446::Zp]) -> Self {
msm::msm_wnaf_g1_446(bases, scalars)
// Self::Affine::multi_mul_scalar(bases, scalars)
// overhead seems to not be worth it outside of wasm
if cfg!(target_family = "wasm") {
msm::msm_wnaf_g1_446(bases, scalars)
} else {
Self::Affine::multi_mul_scalar(bases, scalars)
}
}

fn to_bytes(self) -> impl AsRef<[u8]> {
@@ -365,9 +387,14 @@ impl CurveGroupOps<bls12_446::Zp> for bls12_446::G2 {
}

fn mul_scalar(self, scalar: bls12_446::Zp) -> Self {
self.mul_scalar(scalar)
if scalar.inner == MontFp!("2") {
self.double()
} else {
self.mul_scalar(scalar)
}
}

#[track_caller]
fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_446::Zp]) -> Self {
Self::Affine::multi_mul_scalar(bases, scalars)
}
@@ -393,13 +420,16 @@ impl PairingGroupOps<bls12_446::Zp, bls12_446::G1, bls12_446::G2> for bls12_446:
}

fn pairing(x: bls12_446::G1, y: bls12_446::G2) -> Self {
if x == bls12_446::G1::ZERO || y == bls12_446::G2::ZERO {
return Self::pairing(bls12_446::G1::ZERO, bls12_446::G2::GENERATOR);
}
Self::pairing(x, y)
}
}

#[derive(Copy, Clone, serde::Serialize, serde::Deserialize)]
#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize)]
pub struct Bls12_381;
#[derive(Copy, Clone, serde::Serialize, serde::Deserialize)]
#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize)]
pub struct Bls12_446;

impl Curve for Bls12_381 {

@@ -55,6 +55,7 @@ mod g1 {
}

impl G1Affine {
#[track_caller]
pub fn multi_mul_scalar(bases: &[Self], scalars: &[Zp]) -> G1 {
// SAFETY: interpreting a `repr(transparent)` pointer as its contents.
G1 {
@@ -124,6 +125,7 @@ mod g1 {
}
}

#[track_caller]
pub fn multi_mul_scalar(bases: &[Self], scalars: &[Zp]) -> Self {
use rayon::prelude::*;
let bases = bases
@@ -230,6 +232,7 @@ mod g2 {
}

impl G2Affine {
#[track_caller]
pub fn multi_mul_scalar(bases: &[Self], scalars: &[Zp]) -> G2 {
// SAFETY: interpreting a `repr(transparent)` pointer as its contents.
G2 {
@@ -247,10 +250,10 @@ mod g2 {
// functions. we cache it since it requires a Zp division
// https://hackmd.io/@tazAymRSQCGXTUKkbh1BAg/Sk27liTW9#Math-Formula-for-Point-Addition
pub(crate) fn compute_m(self, other: G2Affine) -> Option<crate::curve_446::Fq2> {
let zero = crate::curve_446::Fq2::ZERO;

// in the context of elliptic curves, the point at infinity is the zero element of the
// group
let zero = crate::curve_446::Fq2::ZERO;

if self.inner.infinity || other.inner.infinity {
return None;
}

@@ -1,6 +1,6 @@
use ark_ec::short_weierstrass::Affine;
use ark_ec::AffineRepr;
use ark_ff::{AdditiveGroup, BigInt, BigInteger, Field, Fp, PrimeField};
use ark_ff::{AdditiveGroup, BigInteger, Field, Fp, PrimeField};
use rayon::prelude::*;

fn make_digits(a: &impl BigInteger, w: usize, num_bits: usize) -> impl Iterator<Item = i64> + '_ {
@@ -46,6 +46,7 @@ fn make_digits(a: &impl BigInteger, w: usize, num_bits: usize) -> impl Iterator<
}

// Compute msm using windowed non-adjacent form
#[track_caller]
pub fn msm_wnaf_g1_446(
bases: &[super::bls12_446::G1Affine],
scalars: &[super::bls12_446::Zp],
@@ -236,207 +237,3 @@ pub fn msm_wnaf_g1_446(
total
})
}

// Compute msm using windowed non-adjacent form
pub fn msm_wnaf_g1_446_extended(
bases: &[super::bls12_446::G1Affine],
scalars: &[super::bls12_446::Zp],
) -> super::bls12_446::G1 {
use super::bls12_446::*;
type BaseField = Fp<ark_ff::MontBackend<crate::curve_446::FqConfig, 7>, 7>;

// let num_bits = 75usize;
// let mask = BigInt([!0, (1 << 11) - 1, 0, 0, 0]);
// let scalars = &*scalars
// .par_iter()
// .map(|x| x.inner.into_bigint())
// .flat_map_iter(|x| (0..4).map(move |i| (x >> (75 * i)) & mask))
// .collect::<Vec<_>>();

let num_bits = 150usize;
let mask = BigInt([!0, !0, (1 << 22) - 1, 0, 0]);
let scalars = &*scalars
.par_iter()
.map(|x| x.inner.into_bigint())
.flat_map_iter(|x| (0..2).map(move |i| (x >> (150 * i)) & mask))
.collect::<Vec<_>>();

assert_eq!(bases.len(), scalars.len());

let size = bases.len();

let c = if size < 32 {
3
} else {
// natural log approx
(size.ilog2() as usize * 69 / 100) + 2
};
let c = c - 3;

let digits_count = (num_bits + c - 1) / c;
let scalar_digits = scalars
.into_par_iter()
.flat_map_iter(|s| make_digits(s, c, num_bits))
.collect::<Vec<_>>();

let zero = G1Affine {
inner: Affine::zero(),
};

let window_sums: Vec<_> = (0..digits_count)
.into_par_iter()
.map(|i| {
let n = 1 << c;
let mut indices = vec![vec![]; n];
let mut d = vec![BaseField::ZERO; n + 1];
let mut e = vec![BaseField::ZERO; n + 1];

for (idx, digits) in scalar_digits.chunks(digits_count).enumerate() {
use core::cmp::Ordering;
// digits is the digits thing of the first scalar?
let scalar = digits[i];
match 0.cmp(&scalar) {
Ordering::Less => indices[(scalar - 1) as usize].push(idx),
Ordering::Greater => indices[(-scalar - 1) as usize].push(!idx),
Ordering::Equal => (),
}
}

let mut buckets = vec![zero; 1 << c];

loop {
d[0] = BaseField::ONE;
for (k, (bucket, idx)) in core::iter::zip(&mut buckets, &mut indices).enumerate() {
if let Some(idx) = idx.last().copied() {
let value = if idx >> (usize::BITS - 1) == 1 {
let mut val = bases[!idx];
val.inner.y = -val.inner.y;
val
} else {
bases[idx]
};

if !bucket.inner.infinity {
let a = value.inner.x - bucket.inner.x;
if a != BaseField::ZERO {
d[k + 1] = d[k] * a;
} else if value.inner.y == bucket.inner.y {
d[k + 1] = d[k] * value.inner.y.double();
} else {
d[k + 1] = d[k];
}
continue;
}
}
d[k + 1] = d[k];
}
e[n] = d[n].inverse().unwrap();

for (k, (bucket, idx)) in core::iter::zip(&mut buckets, &mut indices)
.enumerate()
.rev()
{
if let Some(idx) = idx.last().copied() {
let value = if idx >> (usize::BITS - 1) == 1 {
let mut val = bases[!idx];
val.inner.y = -val.inner.y;
val
} else {
bases[idx]
};

if !bucket.inner.infinity {
let a = value.inner.x - bucket.inner.x;
if a != BaseField::ZERO {
e[k] = e[k + 1] * a;
} else if value.inner.y == bucket.inner.y {
e[k] = e[k + 1] * value.inner.y.double();
} else {
e[k] = e[k + 1];
}
continue;
}
}
e[k] = e[k + 1];
}

let d = &d[..n];
let e = &e[1..];

let mut empty = true;
for ((&d, &e), (bucket, idx)) in core::iter::zip(
core::iter::zip(d, e),
core::iter::zip(&mut buckets, &mut indices),
) {
empty &= idx.len() <= 1;
if let Some(idx) = idx.pop() {
let value = if idx >> (usize::BITS - 1) == 1 {
let mut val = bases[!idx];
val.inner.y = -val.inner.y;
val
} else {
bases[idx]
};

if !bucket.inner.infinity {
let x1 = bucket.inner.x;
let x2 = value.inner.x;
let y1 = bucket.inner.y;
let y2 = value.inner.y;

let eq_x = x1 == x2;

if eq_x && y1 != y2 {
bucket.inner.infinity = true;
} else {
let r = d * e;
let m = if eq_x {
let x1 = x1.square();
x1 + x1.double()
} else {
y2 - y1
};
let m = m * r;

let x3 = m.square() - x1 - x2;
let y3 = m * (x1 - x3) - y1;
bucket.inner.x = x3;
bucket.inner.y = y3;
}
} else {
*bucket = value;
}
}
}

if empty {
break;
}
}

let mut running_sum = G1::ZERO;
let mut res = G1::ZERO;
buckets.into_iter().rev().for_each(|b| {
running_sum.inner += b.inner;
res += running_sum;
});
res
})
.collect();

// We store the sum for the lowest window.
let lowest = *window_sums.first().unwrap();

// We're traversing windows from high to low.
lowest
+ window_sums[1..]
.iter()
.rev()
.fold(G1::ZERO, |mut total, &sum_i| {
total += sum_i;
for _ in 0..c {
total = total.double();
}
total
})
}
|
||||
|
||||
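The `d`/`e` passes above are Montgomery's batch-inversion trick: the single field inversion `d[n].inverse()` serves every affine addition in the round. A standalone sketch of the idea; the generic `F`, `one`, and `inv` parameters are stand-ins for `BaseField` and its operations, not names from this diff:

// Montgomery batch inversion: invert n field elements with one inversion
// plus O(n) multiplications.
fn batch_inverse<F: Copy + core::ops::Mul<Output = F>>(
    xs: &[F],
    one: F,
    inv: impl Fn(F) -> F, // a single field inversion
) -> Vec<F> {
    let n = xs.len();
    // d[k] = xs[0] * ... * xs[k - 1] (prefix products)
    let mut d = vec![one; n + 1];
    for k in 0..n {
        d[k + 1] = d[k] * xs[k];
    }
    // e[k] = (xs[0] * ... * xs[k - 1])^-1, all derived from inverting d[n]
    let mut e = vec![one; n + 1];
    e[n] = inv(d[n]);
    for k in (0..n).rev() {
        e[k] = e[k + 1] * xs[k];
    }
    // xs[k]^-1 = d[k] * e[k + 1]
    (0..n).map(|k| d[k] * e[k + 1]).collect()
}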
308  tfhe-zk-pok/src/four_squares.rs  (new file)
@@ -0,0 +1,308 @@

use ark_ff::biginteger::arithmetic::widening_mul;
use rand::prelude::*;

pub fn sqr<T: Copy + core::ops::Mul>(x: T) -> T::Output {
    x * x
}

// copied from the standard library
// since isqrt is unstable at the moment
pub fn isqrt(this: u128) -> u128 {
    if this < 2 {
        return this;
    }

    // The algorithm is based on the one presented in
    // <https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Binary_numeral_system_(base_2)>
    // which cites as source the following C code:
    // <https://web.archive.org/web/20120306040058/http://medialab.freaknet.org/martin/src/sqrt/sqrt.c>.

    let mut op = this;
    let mut res = 0;
    let mut one = 1 << (this.ilog2() & !1);

    while one != 0 {
        if op >= res + one {
            op -= res + one;
            res = (res >> 1) + one;
        } else {
            res >>= 1;
        }
        one >>= 2;
    }

    res
}
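A quick property check for `isqrt` (a sketch, not part of the diff): the returned value is the floor square root.

fn isqrt_floor_check() {
    for x in (0u128..1 << 20).step_by(997) {
        let r = isqrt(x);
        // r is the floor square root: r^2 <= x < (r + 1)^2.
        assert!(r * r <= x);
        assert!((r + 1) * (r + 1) > x);
    }
}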

fn half_gcd(p: u128, s: u128) -> u128 {
    // Hermite-Serret step: given s with s^2 = -1 (mod p), run the Euclidean
    // algorithm on (p, s) and return the first remainder at most sqrt(p);
    // for a prime p = 1 (mod 4) this is z with p = z^2 + w^2.
    let sq_p = isqrt(p as _);
    let mut a = p;
    let mut b = s;
    while b > sq_p {
        let r = a % b;
        a = b;
        b = r;
    }
    b
}
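A tiny worked example of that step (a sketch, not part of the diff), using p = 13 and a square root of -1 modulo 13:

fn half_gcd_demo() {
    // 8^2 = 64 = -1 (mod 13), and 8 lies in the upper half (8 > 13 / 2).
    assert_eq!((8u128 * 8) % 13, 12);
    // The first Euclidean remainder at most sqrt(13) is z = 3, and indeed
    // 13 = 3^2 + 2^2.
    let z = half_gcd(13, 8);
    assert_eq!(z, 3);
    assert_eq!(13, z * z + 2 * 2);
}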

// Inverse of an odd p modulo 2^64, via the extended Euclidean algorithm on
// (p, 2^64); the Bezout coefficient of p is the inverse.
fn modular_inv_2_64(p: u64) -> u64 {
    assert_eq!(p % 2, 1);

    let mut old_r = p as u128;
    let mut r = 1u128 << 64;

    let mut old_s = 1u64;
    let mut s = 0u64;

    while r != 0 {
        let q = old_r / r;
        (old_r, r) = (r, old_r - q * r);

        let q = q as u64;
        (old_s, s) = (s, old_s.wrapping_sub(q.wrapping_mul(s)));
    }

    assert_eq!(u64::wrapping_mul(old_s, p), 1);
    old_s
}
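An equivalent way to compute this inverse (a sketch of an alternative, not what the diff uses) is Newton/Hensel lifting, which doubles the number of correct low bits per step:

fn modular_inv_2_64_newton(p: u64) -> u64 {
    assert_eq!(p % 2, 1);
    // p is odd, so inv = 1 is correct mod 2; six lifting steps reach
    // 2 -> 4 -> 8 -> 16 -> 32 -> 64 correct bits.
    let mut inv = 1u64;
    for _ in 0..6 {
        inv = inv.wrapping_mul(2u64.wrapping_sub(p.wrapping_mul(inv)));
    }
    debug_assert_eq!(p.wrapping_mul(inv), 1);
    inv
}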

#[derive(Copy, Clone, Debug)]
struct Montgomery {
    p: u128,
    r2: u128,
    p_prime: u64,
}

impl Montgomery {
    fn new(p: u128) -> Self {
        assert_ne!(p, 0);
        assert_eq!(p % 2, 1);

        // r = 2^128
        // we want to compute r^2 mod p
        // p.wrapping_neg() is 2^128 - p, which is r mod p
        let r = p.wrapping_neg() % p;

        let r = num_bigint::BigUint::from(r);
        let r2 = &r * &r;
        let r2 = r2 % p;
        let r2_digits = &*r2.to_u64_digits();

        let r2 = match *r2_digits {
            [] => 0u128,
            [a] => a as u128,
            [a, b] => a as u128 | ((b as u128) << 64),
            _ => unreachable!("value modulo 128-bit integer should have at most two u64 digits"),
        };

        // p_prime = -p^-1 mod 2^64, as required by REDC.
        let p_prime = modular_inv_2_64(p as u64).wrapping_neg();

        Self { p, r2, p_prime }
    }

    fn redc(self, lo: u128, hi: u128) -> u128 {
        let p0 = self.p as u64;
        let p1 = (self.p >> 64) as u64;

        let t0 = lo as u64;
        let mut t1 = (lo >> 64) as u64;
        let mut t2 = hi as u64;
        let mut t3 = (hi >> 64) as u64;
        let mut t4 = 0u64;

        // First reduction round: add m * p with m = t0 * p', which zeroes the
        // lowest 64-bit limb of t.
        {
            let m = u64::wrapping_mul(t0, self.p_prime);
            let mut c = 0u64;

            let x = c as u128 + t0 as u128 + widening_mul(m, p0);
            // t0 = x as u64; // always zero by choice of m
            c = (x >> 64) as u64;

            let x = c as u128 + t1 as u128 + widening_mul(m, p1);
            t1 = x as u64;
            c = (x >> 64) as u64;

            let x = c as u128 + t2 as u128;
            t2 = x as u64;
            c = (x >> 64) as u64;

            let x = c as u128 + t3 as u128;
            t3 = x as u64;
            c = (x >> 64) as u64;

            t4 += c;
        }

        // Second round zeroes the next limb; the surviving limbs are
        // t / 2^128 mod p, up to one final subtraction of p.
        {
            let m = u64::wrapping_mul(t1, self.p_prime);
            let mut c = 0u64;

            let x = c as u128 + t1 as u128 + widening_mul(m, p0);
            // t1 = x as u64; // always zero by choice of m
            c = (x >> 64) as u64;

            let x = c as u128 + t2 as u128 + widening_mul(m, p1);
            t2 = x as u64;
            c = (x >> 64) as u64;

            let x = c as u128 + t3 as u128;
            t3 = x as u64;
            c = (x >> 64) as u64;

            t4 += c;
        }

        let mut s0 = t2;
        let mut s1 = t3;
        let s2 = t4;

        // Conditional final subtraction to bring the result below p.
        if !(s2 == 0 && (s1, s0) < (p1, p0)) {
            let borrow;
            (s0, borrow) = u64::overflowing_sub(s0, p0);
            s1 = s1.wrapping_sub(p1).wrapping_sub(borrow as u64);
        }

        s0 as u128 | ((s1 as u128) << 64)
    }

    fn mont_from_natural(self, x: u128) -> u128 {
        self.mul(x, self.r2)
    }

    fn natural_from_mont(self, x: u128) -> u128 {
        self.redc(x, 0)
    }

    fn mul(self, x: u128, y: u128) -> u128 {
        let x0 = x as u64;
        let x1 = (x >> 64) as u64;
        let y0 = y as u64;
        let y1 = (y >> 64) as u64;

        // Schoolbook 128x128 -> 256-bit product, then Montgomery reduction.
        let lolo = widening_mul(x0, y0);
        let lohi = widening_mul(x0, y1);
        let hilo = widening_mul(x1, y0);
        let hihi = widening_mul(x1, y1);

        let lo = lolo;
        let (lo, o0) = u128::overflowing_add(lo, lohi << 64);
        let (lo, o1) = u128::overflowing_add(lo, hilo << 64);

        let hi = hihi + (lohi >> 64) + (hilo >> 64) + (o0 as u128 + o1 as u128);

        self.redc(lo, hi)
    }

    // Square-and-multiply exponentiation in Montgomery form. Note: callers
    // only use n >= 1; for n == 0 this returns the natural 1 rather than the
    // Montgomery representation of 1.
    fn exp(self, x: u128, n: u128) -> u128 {
        if n == 0 {
            return 1;
        }
        let mut y = self.mont_from_natural(1);
        let mut x = x;
        let mut n = n;
        while n > 1 {
            if n % 2 == 1 {
                y = self.mul(x, y);
            }
            x = self.mul(x, x);
            n /= 2;
        }
        self.mul(x, y)
    }
}
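A round-trip sanity sketch for the Montgomery arithmetic above (a hypothetical check, not in the diff; the modulus below is an arbitrary odd value chosen for illustration):

fn montgomery_roundtrip_sketch() {
    let p = (1u128 << 89) - 1; // any odd modulus accepted by Montgomery::new
    let m = Montgomery::new(p);
    let x = 0x1234_5678_9abc_def0_u128 % p;
    let y = 0xfeed_face_cafe_beef_u128 % p;
    let (xm, ym) = (m.mont_from_natural(x), m.mont_from_natural(y));
    // Into and out of Montgomery form is the identity modulo p.
    assert_eq!(m.natural_from_mont(xm), x);
    // `mul` in Montgomery form agrees with plain modular multiplication,
    // using num_bigint (already a dependency here) for the reference value.
    let reference = (num_bigint::BigUint::from(x) * y) % p;
    assert_eq!(
        num_bigint::BigUint::from(m.natural_from_mont(m.mul(xm, ym))),
        reference
    );
}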

pub fn four_squares(v: u128) -> [u64; 4] {
    let rng = &mut StdRng::seed_from_u64(0);

    let f = v % 4;
    if f == 2 {
        let b = isqrt(v as _) as u64;

        'main_loop: loop {
            let x = 2 + rng.gen::<u64>() % (b - 2);
            let y = 2 + rng.gen::<u64>() % (b - 2);

            let (sum, o) = u128::overflowing_add(sqr(x as u128), sqr(y as u128));
            if o || sum > v {
                continue 'main_loop;
            }

            let p = v - sum;

            if p == 0 || p == 1 {
                return [0, p as u64, x, y];
            }

            if p % 4 != 1 {
                continue 'main_loop;
            }

            // Write p - 1 = d * 2^s with d odd.
            let mut d = p - 1;
            let mut s = 0u32;
            while d % 2 == 0 {
                d /= 2;
                s += 1;
            }
            let d = d;
            let s = s;

            let mont = Montgomery::new(p);
            let a = 2 + (rng.gen::<u128>() % (p - 3));

            // Miller-Rabin-style pass: if p is prime (p = 1 mod 4), an x with
            // x^2 = -1 found below is a square root of -1 mod p; otherwise we
            // usually detect compositeness and retry with new x, y.
            let mut sqrt = 0;
            {
                let a = mont.mont_from_natural(a);
                let one = mont.mont_from_natural(1);
                let neg_one = p - one;

                let mut x = mont.exp(a, d);
                let mut y = 0;

                for _ in 0..s {
                    y = mont.mul(x, x);
                    if y == one && x != one && x != neg_one {
                        continue 'main_loop;
                    }
                    if y == neg_one {
                        sqrt = x;
                    }
                    x = y;
                }
                if y != one {
                    continue 'main_loop;
                }
            }
            if sqrt == 0 {
                continue 'main_loop;
            }

            let i = mont.natural_from_mont(sqrt);
            let i = if i <= p / 2 { p - i } else { i };
            let z = half_gcd(p, i) as u64;
            let w = isqrt(p - sqr(z as u128)) as u64;

            if p != sqr(z as u128) + sqr(w as u128) {
                continue 'main_loop;
            }

            return [x, y, z, w];
        }
    } else if f == 0 {
        // v = 4u: scale a decomposition of u by 2.
        four_squares(v / 4).map(|x| x + x)
    } else {
        // v odd: decompose 2v (which is 2 mod 4), then recombine. Exactly two
        // of the four values are odd; sort so the even pair comes first, each
        // pair in decreasing order, keeping the combined halves non-negative.
        let mut r = four_squares(2 * v);
        r.sort_by_key(|&x| {
            if x % 2 == 0 {
                -1 - ((x / 2) as i64)
            } else {
                (x / 2) as i64
            }
        });
        [
            (r[0] + r[1]) / 2,
            (r[0] - r[1]) / 2,
            (r[3] + r[2]) / 2,
            (r[3] - r[2]) / 2,
        ]
    }
}
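A usage sketch (not part of the diff) checking the four-square decomposition on the output:

fn four_squares_sketch(v: u128) {
    let [x, y, z, w] = four_squares(v);
    // The components are u64, so each square fits in u128; for a correct
    // decomposition the partial sums never exceed v.
    let sum = sqr(x as u128) + sqr(y as u128) + sqr(z as u128) + sqr(w as u128);
    assert_eq!(sum, v);
}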

@@ -3,3 +3,5 @@ pub use ark_serialize::{CanonicalDeserialize, CanonicalSerialize, Compress, Vali
pub mod curve_446;
pub mod curve_api;
pub mod proofs;

mod four_squares;

@@ -141,5 +141,6 @@ pub const HASH_METADATA_LEN_BYTES: usize = 256;
pub mod binary;
pub mod index;
pub mod pke;
pub mod pke_v2;
pub mod range;
pub mod rlwe;
@@ -101,7 +101,7 @@ impl<G: Curve> PublicCommit<G> {
            b,
            c1,
            c2,
            __marker: Default::default(),
            __marker: PhantomData,
        }
    }
}

@@ -194,6 +194,7 @@ pub fn commit<G: Curve>(
pub fn prove<G: Curve>(
    public: (&PublicParams<G>, &PublicCommit<G>),
    private_commit: &PrivateCommit<G>,
    metadata: &[u8],
    load: ComputeLoad,
    rng: &mut dyn RngCore,
) -> Proof<G> {

@@ -347,7 +348,10 @@ pub fn prove<G: Curve>(
        .collect::<Box<_>>();

    let mut y = vec![G::Zp::ZERO; n];
    G::Zp::hash(&mut y, &[hash, x_bytes, c_hat.to_bytes().as_ref()]);
    G::Zp::hash(
        &mut y,
        &[hash, metadata, x_bytes, c_hat.to_bytes().as_ref()],
    );
    let y = OneBased(y);

    let scalars = (n + 1 - big_d..n + 1)
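The same change repeats across every challenge derivation below: the caller-supplied `metadata` is absorbed into each Fiat-Shamir hash, so a proof produced for one context fails verification in another. A minimal sketch of that binding pattern; the `challenge` helper and its length-prefixing are illustrative assumptions, not this crate's API:

// Every challenge hashes a domain tag, the metadata, and the transcript so
// far. `challenge` is a hypothetical stand-in for `G::Zp::hash`.
fn challenge(domain: &[u8], metadata: &[u8], transcript: &[&[u8]]) -> [u8; 32] {
    use sha3::{Digest, Sha3_256};
    let mut h = Sha3_256::new();
    for part in [domain, metadata].iter().chain(transcript.iter()) {
        // Length-prefix each part so concatenation is unambiguous.
        h.update((part.len() as u64).to_le_bytes());
        h.update(part);
    }
    h.finalize().into()
}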

@@ -360,6 +364,7 @@ pub fn prove<G: Curve>(
        &mut theta,
        &[
            hash_lmap,
            metadata,
            x_bytes,
            c_hat.to_bytes().as_ref(),
            c_y.to_bytes().as_ref(),

@@ -379,6 +384,7 @@ pub fn prove<G: Curve>(
        &mut t,
        &[
            hash_t,
            metadata,
            &(1..n + 1)
                .flat_map(|i| y[i].to_bytes().as_ref().to_vec())
                .collect::<Box<_>>(),

@@ -394,6 +400,7 @@ pub fn prove<G: Curve>(
        &mut delta,
        &[
            hash_agg,
            metadata,
            x_bytes,
            c_hat.to_bytes().as_ref(),
            c_y.to_bytes().as_ref(),

@@ -472,6 +479,7 @@ pub fn prove<G: Curve>(
        core::array::from_mut(&mut z),
        &[
            hash_z,
            metadata,
            x_bytes,
            c_hat.to_bytes().as_ref(),
            c_y.to_bytes().as_ref(),

@@ -512,6 +520,7 @@ pub fn prove<G: Curve>(
        core::array::from_mut(&mut w),
        &[
            hash_w,
            metadata,
            x_bytes,
            c_hat.to_bytes().as_ref(),
            c_y.to_bytes().as_ref(),

@@ -698,6 +707,7 @@ fn compute_a_theta<G: Curve>(
pub fn verify<G: Curve>(
    proof: &Proof<G>,
    public: (&PublicParams<G>, &PublicCommit<G>),
    metadata: &[u8],
) -> Result<(), ()> {
    let &Proof {
        c_hat,

@@ -760,7 +770,10 @@ pub fn verify<G: Curve>(
        .collect::<Box<_>>();

    let mut y = vec![G::Zp::ZERO; n];
    G::Zp::hash(&mut y, &[hash, x_bytes, c_hat.to_bytes().as_ref()]);
    G::Zp::hash(
        &mut y,
        &[hash, metadata, x_bytes, c_hat.to_bytes().as_ref()],
    );
    let y = OneBased(y);

    let mut theta = vec![G::Zp::ZERO; d + k + 1];

@@ -768,6 +781,7 @@ pub fn verify<G: Curve>(
        &mut theta,
        &[
            hash_lmap,
            metadata,
            x_bytes,
            c_hat.to_bytes().as_ref(),
            c_y.to_bytes().as_ref(),

@@ -792,6 +806,7 @@ pub fn verify<G: Curve>(
        &mut t,
        &[
            hash_t,
            metadata,
            &(1..n + 1)
                .flat_map(|i| y[i].to_bytes().as_ref().to_vec())
                .collect::<Box<_>>(),

@@ -807,6 +822,7 @@ pub fn verify<G: Curve>(
        &mut delta,
        &[
            hash_agg,
            metadata,
            x_bytes,
            c_hat.to_bytes().as_ref(),
            c_y.to_bytes().as_ref(),

@@ -821,6 +837,7 @@ pub fn verify<G: Curve>(
        core::array::from_mut(&mut z),
        &[
            hash_z,
            metadata,
            x_bytes,
            c_hat.to_bytes().as_ref(),
            c_y.to_bytes().as_ref(),

@@ -873,6 +890,7 @@ pub fn verify<G: Curve>(
        core::array::from_mut(&mut w),
        &[
            hash_w,
            metadata,
            x_bytes,
            c_hat.to_bytes().as_ref(),
            c_y.to_bytes().as_ref(),

@@ -1053,6 +1071,15 @@ mod tests {
                .wrapping_add((delta * m[i] as u64) as i64);
        }

        // One of our use cases uses 320 bits of additional metadata
        const METADATA_LEN: usize = (320 / u8::BITS) as usize;

        let mut metadata = [0u8; METADATA_LEN];
        metadata.fill_with(|| rng.gen::<u8>());

        let mut fake_metadata = [255u8; METADATA_LEN];
        fake_metadata.fill_with(|| rng.gen::<u8>());

        let mut m_roundtrip = vec![0i64; k];
        for i in 0..k {
            let mut dot = 0i128;
@@ -1093,60 +1120,77 @@ mod tests {
        let public_param_that_was_not_compressed =
            serialize_then_deserialize(&original_public_param, Compress::Yes).unwrap();

        for public_param in [
            original_public_param,
            public_param_that_was_compressed,
            public_param_that_was_not_compressed,
        ] {
            for use_fake_e1 in [false, true] {
                for use_fake_e2 in [false, true] {
                    for use_fake_m in [false, true] {
                        for use_fake_r in [false, true] {
                            let (public_commit, private_commit) = commit(
                                a.clone(),
                                b.clone(),
                                c1.clone(),
                                c2.clone(),
                                if use_fake_r {
                                    fake_r.clone()
                                } else {
                                    r.clone()
                                },
                                if use_fake_e1 {
                                    fake_e1.clone()
                                } else {
                                    e1.clone()
                                },
                                if use_fake_m {
                                    fake_m.clone()
                                } else {
                                    m.clone()
                                },
                                if use_fake_e2 {
                                    fake_e2.clone()
                                } else {
                                    e2.clone()
                                },
                                &public_param,
                                rng,
                            );
        for (
            public_param,
            use_fake_e1,
            use_fake_e2,
            use_fake_m,
            use_fake_r,
            use_fake_metadata_verify,
        ) in itertools::iproduct!(
            [
                original_public_param,
                public_param_that_was_compressed,
                public_param_that_was_not_compressed,
            ],
            [false, true],
            [false, true],
            [false, true],
            [false, true],
            [false, true]
        ) {
            let (public_commit, private_commit) = commit(
                a.clone(),
                b.clone(),
                c1.clone(),
                c2.clone(),
                if use_fake_r {
                    fake_r.clone()
                } else {
                    r.clone()
                },
                if use_fake_e1 {
                    fake_e1.clone()
                } else {
                    e1.clone()
                },
                if use_fake_m {
                    fake_m.clone()
                } else {
                    m.clone()
                },
                if use_fake_e2 {
                    fake_e2.clone()
                } else {
                    e2.clone()
                },
                &public_param,
                rng,
            );

                            for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
                                let proof = prove(
                                    (&public_param, &public_commit),
                                    &private_commit,
                                    load,
                                    rng,
                                );
            for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
                let proof = prove(
                    (&public_param, &public_commit),
                    &private_commit,
                    &metadata,
                    load,
                    rng,
                );

                                assert_eq!(
                                    verify(&proof, (&public_param, &public_commit)).is_err(),
                                    use_fake_e1 || use_fake_e2 || use_fake_r || use_fake_m
                                );
                            }
                        }
                    }
                }
                let verify_metadata = if use_fake_metadata_verify {
                    &fake_metadata
                } else {
                    &metadata
                };

                assert_eq!(
                    verify(&proof, (&public_param, &public_commit), verify_metadata).is_err(),
                    use_fake_e1
                        || use_fake_e2
                        || use_fake_r
                        || use_fake_m
                        || use_fake_metadata_verify
                );
            }
        }
    }
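The restructured test replaces the five-deep `for` pyramid with `itertools::iproduct!`, which yields the full Cartesian product of the parameter lists as one flat iterator. A minimal illustration (not from the diff):

fn iproduct_demo() {
    let mut count = 0;
    for (a, b, c) in itertools::iproduct!([false, true], [false, true], [false, true]) {
        // Every combination of the three flags appears exactly once.
        let _ = (a, b, c);
        count += 1;
    }
    assert_eq!(count, 8);
}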

2194  tfhe-zk-pok/src/proofs/pke_v2.rs  (new file)
File diff suppressed because it is too large.
@@ -1,6 +1,6 @@
[package]
name = "tfhe"
version = "0.8.0-alpha.2"
version = "0.8.0-alpha.5"
edition = "2021"
readme = "../README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

@@ -62,12 +62,12 @@ lazy_static = { version = "1.4.0", optional = true }
serde = { version = "1.0", features = ["derive"] }
rayon = { version = "1.5.0" }
bincode = "1.3.3"
concrete-fft = { version = "0.4.1", features = ["serde", "fft128"] }
concrete-ntt = { version = "0.1.2" }
pulp = "0.18.8"
concrete-fft = { version = "0.5.1", features = ["serde", "fft128"] }
concrete-ntt = { version = "0.2.0" }
pulp = "0.18.22"
tfhe-cuda-backend = { version = "0.4.0-alpha.0", path = "../backends/tfhe-cuda-backend", optional = true }
aligned-vec = { version = "0.5", features = ["serde"] }
dyn-stack = { version = "0.9" }
dyn-stack = { version = "0.10" }
paste = "1.0.7"
fs2 = { version = "0.4.3", optional = true }
# Used for OPRF in shortint

@@ -75,8 +75,8 @@ sha3 = { version = "0.10", optional = true }
# While we wait for repeat_n in rust standard library
itertools = "0.11.0"
rand_core = { version = "0.6.4", features = ["std"] }
tfhe-zk-pok = { version = "0.3.0-alpha.0", path = "../tfhe-zk-pok", optional = true }
tfhe-versionable = { version = "0.2.0", path = "../utils/tfhe-versionable" }
tfhe-zk-pok = { version = "0.3.0-alpha.1", path = "../tfhe-zk-pok", optional = true }
tfhe-versionable = { version = "0.2.1", path = "../utils/tfhe-versionable" }

# wasm deps
wasm-bindgen = { version = "0.2.86", features = [
Some files were not shown because too many files have changed in this diff.