Compare commits

...

18 Commits

Author SHA1 Message Date
Guillermo Oyarzun
caf7fdae77 try new vec functions 2024-11-29 09:52:22 +01:00
Guillermo Oyarzun
abab60a6e8 add doublekeybundle 2024-11-26 17:32:05 +01:00
Guillermo Oyarzun
644bac8fd8 remove some syncs 2024-11-25 16:45:24 +01:00
Nicolas Sarlin
530b18063a fix: zk-pok bench workflow using an invalid argument 2024-11-25 14:58:42 +01:00
Nicolas Sarlin
c5caacf56e chore(zk): add a test for compute_crs_params 2024-11-25 14:34:08 +01:00
Nicolas Sarlin
68cfd1008a chore(zk): add a test of a proof with invalid noise in zk 2024-11-25 14:34:08 +01:00
Nicolas Sarlin
87dbfdcd5e fix(zk): recompute B according to k in proof and use squared bounds
This removes the need for sqrt operations
also fix a proof slack that was too big in v2
2024-11-25 14:34:08 +01:00
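(A short aside on why squared bounds remove the sqrt, stated as a general fact about norms rather than a claim about this proof's internals: for B ≥ 0, ‖e‖ ≤ B ⇔ ‖e‖² ≤ B², and ‖e‖² = Σᵢ eᵢ² is computable without any square root, so the check can be done entirely on squared quantities.)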
Nicolas Sarlin
770ae22bb6 refactor(zk): place asserts in proof behind a condition 2024-11-25 14:34:08 +01:00
Nicolas Sarlin
1e19bae29a refactor(zk): factorize r1/r2 computation between proofs 2024-11-25 14:34:08 +01:00
Nicolas Sarlin
811ae3c551 refactor(zk): factorize q decoding between proofs 2024-11-25 14:34:08 +01:00
Agnes Leroy
832703a46a chore(ci): add erc20 tests 2024-11-25 13:23:48 +01:00
Guillermo Oyarzun
81e11a6d70 feat(gpu): improve full propagation in sum and sub 2024-11-25 13:23:37 +01:00
David Testé
100c3ae77a chore(ci): fix gpu multi values ops flavor parsing 2024-11-25 13:18:11 +01:00
dependabot[bot]
db61b0bb9b chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions
Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 3.0.16 to 3.0.17.
- [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases)
- [Commits](38608ef4fb...5d6ac37a4c)

---
updated-dependencies:
- dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-25 11:06:28 +01:00
dependabot[bot]
dc8091ad0f chore(deps): bump actions/upload-artifact from 3.1.2 to 4.4.3
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 3.1.2 to 4.4.3.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/v3.1.2...b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-25 11:06:19 +01:00
dependabot[bot]
3ccfb9616a chore(deps): bump zama-ai/slab-github-runner from 1.2.0 to 1.3.0
Bumps [zama-ai/slab-github-runner](https://github.com/zama-ai/slab-github-runner) from 1.2.0 to 1.3.0.
- [Release notes](https://github.com/zama-ai/slab-github-runner/releases)
- [Commits](https://github.com/zama-ai/slab-github-runner/compare/v1.2.0...98f0788261a7323d5d695a883e20df36591a92b7)

---
updated-dependencies:
- dependency-name: zama-ai/slab-github-runner
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-25 11:06:12 +01:00
dependabot[bot]
83dc9b9453 chore(deps): bump dtolnay/rust-toolchain
Bumps [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) from 7b1c307e0dcbda6122208f10795a713336a9b35a to 315e265cd78dad1e1dcf3a5074f6d6c47029d5aa.
- [Release notes](https://github.com/dtolnay/rust-toolchain/releases)
- [Commits](7b1c307e0d...315e265cd7)

---
updated-dependencies:
- dependency-name: dtolnay/rust-toolchain
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-25 11:06:04 +01:00
dependabot[bot]
4fe72a15c0 chore(deps): bump rtCamp/action-slack-notify from 2.2.1 to 2.3.2
Bumps [rtCamp/action-slack-notify](https://github.com/rtcamp/action-slack-notify) from 2.2.1 to 2.3.2.
- [Release notes](https://github.com/rtcamp/action-slack-notify/releases)
- [Commits](https://github.com/rtcamp/action-slack-notify/compare/v2.2.1...c33737706dea87cd7784c687dadc9adf1be59990)

---
updated-dependencies:
- dependency-name: rtCamp/action-slack-notify
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-25 11:05:56 +01:00
106 changed files with 5753 additions and 1644 deletions

View File

@@ -26,7 +26,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -100,7 +100,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -132,7 +132,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -158,7 +158,7 @@ jobs:
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -269,7 +269,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -73,7 +73,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -138,7 +138,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -73,7 +73,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -142,7 +142,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -141,7 +141,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -167,7 +167,7 @@ jobs:
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -249,7 +249,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -119,7 +119,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -127,7 +127,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -26,7 +26,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -58,7 +58,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -115,7 +115,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -124,7 +124,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -54,7 +54,7 @@ jobs:
echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -126,7 +126,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,7 +84,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -167,7 +167,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -54,7 +54,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,7 +111,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -196,7 +196,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -83,7 +83,7 @@ jobs:
- name: Set multiple operations flavors
if: ${{ contains(inputs.op_flavor, ',')}}
run: |
PARSED_OP_FLAVOR=$(echo "${{ inputs.op_flavor }}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
PARSED_OP_FLAVOR=$(echo "${{ inputs.op_flavor }}" | sed 's/[[:space:]]*,[[:space:]]*/", "/g')
echo "OP_FLAVOR=[\"${PARSED_OP_FLAVOR}\"]" >> "${GITHUB_ENV}"
- name: Set benchmark types
@@ -118,7 +118,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -181,7 +181,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -271,7 +271,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
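To see what the op_flavor parsing fix above changes, take a hypothetical input "default, unchecked" (a placeholder, not taken from this run). The old sed replacement text \", \" turned it into the literal string default\", \"unchecked, so the step exported OP_FLAVOR=["default\", \"unchecked"], which contains stray backslashes and is not valid JSON. The corrected replacement ", " yields default", "unchecked and therefore OP_FLAVOR=["default", "unchecked"].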

View File

@@ -91,7 +91,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -131,7 +131,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -198,7 +198,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -56,7 +56,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -163,7 +163,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -90,7 +90,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -191,7 +191,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -81,7 +81,7 @@ jobs:
--name-suffix avx512
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
with:
name: ${{ github.sha }}_fft
path: ${{ env.RESULTS_FILENAME }}
@@ -110,7 +110,7 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-fft benchmarks failed. (${{ env.ACTION_RUN_URL }})"
@@ -123,7 +123,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (fft-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -81,7 +81,7 @@ jobs:
--name-suffix avx512
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
with:
name: ${{ github.sha }}_ntt
path: ${{ env.RESULTS_FILENAME }}
@@ -110,7 +110,7 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-ntt benchmarks failed. (${{ env.ACTION_RUN_URL }})"
@@ -123,7 +123,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (ntt-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -58,7 +58,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,7 +91,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -118,8 +118,7 @@ jobs:
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
--name-suffix avx512
- name: Upload parsed results artifact
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
@@ -156,7 +155,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -64,7 +64,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -98,7 +98,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -199,7 +199,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -73,7 +73,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -106,7 +106,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: nightly
@@ -181,7 +181,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -28,7 +28,7 @@ jobs:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable

View File

@@ -27,7 +27,7 @@ jobs:
make lint_workflow
- name: Ensure SHA pinned actions
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@38608ef4fb69adae7f1eac6eeb88e67b7d083bfd # v3.0.16
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@5d6ac37a4cef8b8df67f482a8e384987766f0213 # v3.0.17
with:
allowlist: |
slsa-framework/slsa-github-generator

View File

@@ -25,7 +25,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,7 +47,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -121,7 +121,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -75,7 +75,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable

View File

@@ -67,7 +67,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -120,7 +120,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -186,7 +186,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -65,7 +65,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -118,7 +118,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -184,7 +184,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -25,7 +25,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -76,7 +76,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -139,7 +139,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -67,7 +67,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -120,7 +120,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -189,7 +189,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -0,0 +1,146 @@
name: AWS Long Run Tests on GPU
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# Weekly tests will be triggered each Friday at 1 a.m.
- cron: '0 1 * * FRI'
jobs:
setup-instance:
name: Setup instance (gpu-tests)
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
cuda-tests:
name: Long run GPU H100 tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run tests
run: |
make test_integer_long_run_gpu
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ needs.cuda-tests.result }}
SLACK_MESSAGE: "Integer GPU H100 long run tests finished with status: ${{ needs.cuda-tests.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (gpu-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (gpu-long-run-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -24,7 +24,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -110,7 +110,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -68,7 +68,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,7 +119,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -171,7 +171,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -68,7 +68,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,7 +119,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -171,7 +171,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -74,7 +74,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -128,7 +128,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -188,7 +188,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -68,7 +68,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,7 +119,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -171,7 +171,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -68,7 +68,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,7 +119,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -171,7 +171,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -74,7 +74,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -125,7 +125,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -185,7 +185,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -0,0 +1,94 @@
name: AWS Long Run Tests on CPU
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# Weekly tests will be triggered each Friday at 1 a.m.
- cron: '0 1 * * FRI'
jobs:
setup-instance:
name: Setup instance (cpu-tests)
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-big
cpu-tests:
name: Long run CPU tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
persist-credentials: 'false'
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Run tests
run: |
make test_integer_long_run
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CPU long run tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cpu-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cpu-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cpu-long-run-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -39,7 +39,7 @@ jobs:
persist-credentials: "false"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable

View File

@@ -36,7 +36,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -70,7 +70,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
with:
toolchain: stable
@@ -119,7 +119,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}

View File

@@ -39,7 +39,7 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}

View File

@@ -585,6 +585,11 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
.PHONY: test_integer_long_run_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_long_run_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,__long_run_tests -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run --test-threads=6
.PHONY: test_integer_compression
test_integer_compression: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -765,6 +770,12 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_integer_long_run # Run the long run tests for integer
test_integer_long_run: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,__long_run_tests -p $(TFHE_SPEC) -- integer::server_key::radix_parallel::tests_long_run
.PHONY: test_safe_serialization # Run the tests for safe serialization
test_safe_serialization: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
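These are the targets the two new long-run workflows in this compare invoke: the CPU workflow runs make test_integer_long_run and the GPU workflow runs make test_integer_long_run_gpu.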

View File

@@ -27,6 +27,15 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort(); \
}
cudaEvent_t cuda_create_event(uint32_t gpu_index);
void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
uint32_t gpu_index);
void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
uint32_t gpu_index);
void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index);
cudaStream_t cuda_create_stream(uint32_t gpu_index);
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
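
A minimal usage sketch for the new event helpers, assuming the thin cudaEvent_t wrapper semantics their definitions further down suggest (GPU index 0 and the elided kernel launches are placeholders):

#include <cuda_runtime.h>
// ... plus the header above declaring the cuda_* wrappers ...

// Cross-stream ordering: work queued on `consumer` after the wait only
// starts once `producer` has reached the recorded event.
void example_cross_stream_sync() {
  cudaStream_t producer = cuda_create_stream(0);
  cudaStream_t consumer = cuda_create_stream(0);
  cudaEvent_t ready = cuda_create_event(0);
  // ... enqueue kernels on `producer` ...
  cuda_event_record(ready, producer, 0);      // mark the point to wait for
  cuda_stream_wait_event(consumer, ready, 0); // consumer waits for that point
  // ... enqueue dependent kernels on `consumer` ...
  cuda_event_destroy(ready, 0);
  cuda_destroy_stream(producer, 0);
  cuda_destroy_stream(consumer, 0);
}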

View File

@@ -35,6 +35,8 @@ enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };
extern "C" {
void scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -282,23 +284,61 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory);
void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory);
void cuda_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_blocks,
uint32_t requested_flag, uint32_t uses_carry);
void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void cuda_add_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_blocks);
void *lhs_array, const void *rhs_array, void *carry_out,
const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry);
void cleanup_cuda_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
bool allocate_gpu_memory);
void cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs_array, const void *rhs_array, void *overflow_block,
const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
uint32_t uses_input_borrow);
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -318,25 +358,6 @@ void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_integer_radix_overflowing_sub_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks_in_radix);
void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_scalar_mul_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -376,26 +397,6 @@ void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks_in_radix);
void cleanup_signed_overflowing_add_or_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
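
The reworked carry-propagation entry points keep the scratch/run/cleanup lifecycle used throughout this header. A hedged calling sketch: every numeric parameter below is an illustrative placeholder, CLASSICAL and FLAG_NONE are taken from the enums visible in these headers, and passing a null carry_in alongside uses_carry = 0 is an assumption the header does not spell out:

void example_propagate_carry(void *const *streams, uint32_t const *gpu_indexes,
                             uint32_t gpu_count, void *lwe_array,
                             void *carry_out, void *const *bsks,
                             void *const *ksks) {
  int8_t *mem = nullptr;
  // 1. Size and allocate the temporary buffers from the crypto parameters
  //    (all values below are placeholders, not recommended parameters).
  scratch_cuda_propagate_single_carry_kb_64_inplace(
      streams, gpu_indexes, gpu_count, &mem,
      /*glwe_dimension=*/1, /*polynomial_size=*/2048,
      /*big_lwe_dimension=*/2048, /*small_lwe_dimension=*/742,
      /*ks_level=*/5, /*ks_base_log=*/3, /*pbs_level=*/1, /*pbs_base_log=*/23,
      /*grouping_factor=*/0, /*num_blocks=*/4, /*message_modulus=*/4,
      /*carry_modulus=*/4, CLASSICAL, /*requested_flag=*/FLAG_NONE,
      /*uses_carry=*/0, /*allocate_gpu_memory=*/true);
  // 2. Propagate carries in place; requested_flag selects the carry/overflow
  //    output behaviour.
  cuda_propagate_single_carry_kb_64_inplace(
      streams, gpu_indexes, gpu_count, lwe_array, carry_out,
      /*carry_in=*/nullptr, mem, bsks, ksks, /*num_blocks=*/4,
      /*requested_flag=*/FLAG_NONE, /*uses_carry=*/0);
  // 3. Free the temporaries.
  cleanup_cuda_propagate_single_carry(streams, gpu_indexes, gpu_count, &mem);
}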

View File

@@ -27,6 +27,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,

View File

@@ -28,7 +28,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
uint32_t num_many_lut, uint32_t lut_stride);
#endif
template <typename Torus>
@@ -46,7 +46,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
uint32_t num_many_lut, uint32_t lut_stride);
template <typename Torus>
void scratch_cuda_multi_bit_programmable_bootstrap(
@@ -63,7 +63,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
uint32_t num_many_lut, uint32_t lut_stride);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(

View File

@@ -255,7 +255,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template <typename Torus>
@@ -266,7 +266,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
#if (CUDA_ARCH >= 900)
@@ -278,7 +278,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template <typename Torus>

View File

@@ -69,7 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -78,7 +78,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);

View File

@@ -27,7 +27,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,

View File

@@ -2,6 +2,30 @@
#include <cstdint>
#include <cuda_runtime.h>
cudaEvent_t cuda_create_event(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cudaEvent_t event;
check_cuda_error(cudaEventCreate(&event));
return event;
}
void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaEventRecord(event, stream));
}
void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamWaitEvent(stream, event, 0));
}
void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaEventDestroy(event));
}
/// Unsafe function to create a CUDA stream; the caller must first check that
/// the GPU exists
cudaStream_t cuda_create_stream(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
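
These helpers are thin wrappers over the standard CUDA event API. A typical use, sketched below with plain CUDA runtime calls, is to make one stream wait on work already enqueued on another without blocking the host (error handling elided for brevity):

#include <cuda_runtime.h>

// Order two streams: work enqueued on stream_b after this call will not run
// until everything recorded on stream_a so far has completed.
void order_streams(cudaStream_t stream_a, cudaStream_t stream_b) {
  cudaEvent_t ev;
  cudaEventCreate(&ev);
  cudaEventRecord(ev, stream_a);        // capture the current tail of stream_a
  cudaStreamWaitEvent(stream_b, ev, 0); // stream_b waits for the event
  cudaEventDestroy(ev);                 // destruction is deferred until done
}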

View File

@@ -305,4 +305,210 @@ __global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
}
}
template <class params>
__device__ void NSMFFT_direct2(double2 *A, double2 u[params::opt >> 1],
double2 v[params::opt >> 1]) {
/* We don't perform bit reversal here, since the twiddles are already
 * stored in bit-reversed order. Each thread is always in charge of
 * "opt/2" pairs of coefficients, which is why we always loop through
 * N/2 with a stride of N/opt. The #pragma unroll directive tells the
 * compiler to unroll the full loop, which should increase performance.
 */
//__syncthreads();
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
constexpr Index LOG2_DEGREE = params::log2_degree;
constexpr Index HALF_DEGREE = params::degree >> 1;
constexpr Index STRIDE = params::degree / params::opt;
Index tid = threadIdx.x;
double2 w;
// u and v already hold the input coefficients; the caller loads them into
// registers before invoking this function
// level 1
// we skip the full complex multiplication on level 1 since there is only
// one twiddle; its real and imaginary parts are equal, so we can use
// simpler operations
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
w = v[i] * (double2){0.707106781186547461715008466854,
0.707106781186547461715008466854};
v[i] = u[i] - w;
u[i] = u[i] + w;
}
Index twiddle_shift = 1;
for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
twiddle_shift <<= 1;
tid = threadIdx.x;
// __syncthreads();
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
A[tid] = (u_stays_in_register) ? v[i] : u[i];
tid = tid + STRIDE;
}
__syncthreads();
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
w = A[tid ^ lane_mask];
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
w = negtwiddles[tid / lane_mask + twiddle_shift];
w *= v[i];
v[i] = u[i] - w;
u[i] = u[i] + w;
tid = tid + STRIDE;
}
__syncthreads();
}
//__syncthreads();
// store registers in SM
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
A[tid * 2] = u[i];
A[tid * 2 + 1] = v[i];
tid = tid + STRIDE;
}
__syncthreads();
}
template <class params>
__device__ void
NSMFFT_direct2_vec(double2 *A, double2 *B, double2 u[params::opt >> 1],
double2 v[params::opt >> 1], double2 u2[params::opt >> 1],
double2 v2[params::opt >> 1]) {
/* We don't perform bit reversal here, since the twiddles are already
 * stored in bit-reversed order. Each thread is always in charge of
 * "opt/2" pairs of coefficients, which is why we always loop through
 * N/2 with a stride of N/opt. The #pragma unroll directive tells the
 * compiler to unroll the full loop, which should increase performance.
 */
//__syncthreads();
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
constexpr Index LOG2_DEGREE = params::log2_degree;
constexpr Index HALF_DEGREE = params::degree >> 1;
constexpr Index STRIDE = params::degree / params::opt;
Index tid = threadIdx.x;
double2 w, w2;
// u/v and u2/v2 already hold the input coefficients; the caller loads them
// into registers before invoking this function
// level 1
// we skip the full complex multiplication on level 1 since there is only
// one twiddle; its real and imaginary parts are equal, so we can use
// simpler operations
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
w = v[i] * (double2){0.707106781186547461715008466854,
0.707106781186547461715008466854};
w2 = v2[i] * (double2){0.707106781186547461715008466854,
0.707106781186547461715008466854};
v[i] = u[i] - w;
u[i] = u[i] + w;
v2[i] = u2[i] - w2;
u2[i] = u2[i] + w2;
}
Index twiddle_shift = 1;
for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
twiddle_shift <<= 1;
tid = threadIdx.x;
// __syncthreads();
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
A[tid] = (u_stays_in_register) ? v[i] : u[i];
B[tid] = (u_stays_in_register) ? v2[i] : u2[i];
tid = tid + STRIDE;
}
__syncthreads();
// if(l >= 5)
// __syncthreads();
// else
// __syncwarp();
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
w = A[tid ^ lane_mask];
w2 = B[tid ^ lane_mask];
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
u2[i] = (u_stays_in_register) ? u2[i] : w2;
v2[i] = (u_stays_in_register) ? w2 : v2[i];
w = negtwiddles[tid / lane_mask + twiddle_shift];
w2 = w * v2[i];
w *= v[i];
v[i] = u[i] - w;
u[i] = u[i] + w;
v2[i] = u2[i] - w2;
u2[i] = u2[i] + w2;
tid = tid + STRIDE;
}
__syncthreads();
// if(l >= 5)
// __syncthreads();
// else
// __syncwarp();
}
//__syncthreads();
// store registers in SM
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
A[tid * 2] = u[i];
A[tid * 2 + 1] = v[i];
B[tid * 2] = u2[i];
B[tid * 2 + 1] = v2[i];
tid = tid + STRIDE;
}
__syncthreads();
}
#endif // GPU_BOOTSTRAP_FFT_CUH
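
Both kernels implement the same negacyclic radix-2 FFT butterfly; the _vec variant simply processes two polynomials per pass so the twiddle loads and synchronizations are shared. The per-level update, shown here as a self-contained host-side sketch for a single coefficient pair (double2 arithmetic replaced by std::complex for clarity), is u' = u + w*v, v' = u - w*v for a complex twiddle w:

#include <complex>
#include <cstdio>

using cplx = std::complex<double>;

// One radix-2 butterfly, as performed per level in the kernels above.
static void butterfly(cplx &u, cplx &v, cplx w) {
  cplx t = w * v;
  v = u - t;
  u = u + t;
}

int main() {
  // The level-1 twiddle has equal real and imaginary parts (sqrt(2)/2),
  // which is why the kernels can use a cheaper multiplication there.
  cplx w(0.707106781186547461715008466854, 0.707106781186547461715008466854);
  cplx u(1.0, 0.0), v(0.0, 1.0);
  butterfly(u, v, w);
  printf("u=(%f,%f) v=(%f,%f)\n", u.real(), u.imag(), v.real(), v.imag());
  return 0;
}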

View File

@@ -58,9 +58,11 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
radix_params.big_lwe_dimension, num_blocks);
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, num_blocks);
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
bsks, ksks, num_blocks, requested_flag, uses_carry);
host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
mem_ptr->bitxor_mem, bsks, ksks, num_blocks);

View File

@@ -1,50 +0,0 @@
#include "integer/addition.cuh"
void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
: SIGNED_OPERATION::SUBTRACTION;
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_signed_overflowing_add_or_sub_memory<uint64_t> **)mem_ptr,
num_blocks, op, params, allocate_gpu_memory);
}
void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks) {
auto mem = (int_signed_overflowing_add_or_sub_memory<uint64_t> *)mem_ptr;
SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
: SIGNED_OPERATION::SUBTRACTION;
host_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lhs), static_cast<uint64_t const *>(rhs),
static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t *const *)(ksks),
mem, num_blocks);
}
void cleanup_signed_overflowing_add_or_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr =
(int_signed_overflowing_add_or_sub_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

View File

@@ -1,149 +0,0 @@
#ifndef TFHE_RS_ADDITION_CUH
#define TFHE_RS_ADDITION_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "pbs/programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
template <typename Torus>
void host_resolve_signed_overflow(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *result, Torus *last_block_inner_propagation,
Torus const *last_block_input_carry, Torus *last_block_output_carry,
int_resolve_signed_overflow_memory<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
auto x = mem->x;
Torus *d_clears =
(Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_set_value_async<Torus>(streams[0], gpu_indexes[0], d_clears, 2, 1);
// replace with host function call
cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
mem->params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, x,
mem->params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, last_block_input_carry,
mem->params.big_lwe_dimension, 1);
host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
last_block_inner_propagation,
mem->resolve_overflow_lut, ksks, bsks, 1);
cuda_drop_async(d_clears, streams[0], gpu_indexes[0]);
}
template <typename Torus>
__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count,
int_signed_overflowing_add_or_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_signed_overflowing_add_or_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, op,
allocate_gpu_memory);
}
/*
* Addition - signed_operation = 1
* Subtraction - signed_operation = -1
*/
template <typename Torus>
__host__ void host_integer_signed_overflowing_add_or_sub_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lhs, Torus const *rhs, Torus *overflowed,
SIGNED_OPERATION op, void *const *bsks, uint64_t *const *ksks,
int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr,
uint32_t num_blocks) {
auto radix_params = mem_ptr->params;
uint32_t big_lwe_dimension = radix_params.big_lwe_dimension;
uint32_t big_lwe_size = big_lwe_dimension + 1;
uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4);
auto result = mem_ptr->result;
auto neg_rhs = mem_ptr->neg_rhs;
auto input_carries = mem_ptr->input_carries;
auto output_carry = mem_ptr->output_carry;
auto last_block_inner_propagation = mem_ptr->last_block_inner_propagation;
cuda_memcpy_async_gpu_to_gpu(result, lhs, num_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
// phase 1
if (op == SIGNED_OPERATION::ADDITION) {
host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
big_lwe_dimension, num_blocks);
} else {
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
big_lwe_dimension, num_blocks);
}
// phase 2
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
host_propagate_single_carry<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
host_generate_last_block_inner_propagation<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
&rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
ksks);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
// phase 3
auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
if (op == SIGNED_OPERATION::SUBTRACTION && num_blocks == 1) {
// Quick fix for the case where the subtraction is done on a single block
Torus *one_scalar =
(Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_set_value_async<Torus>(streams[0], gpu_indexes[0], one_scalar, 1, 1);
create_trivial_radix<Torus>(
streams[0], gpu_indexes[0], input_carry, one_scalar, big_lwe_dimension,
1, 1, radix_params.message_modulus, radix_params.carry_modulus);
cuda_drop_async(one_scalar, streams[0], gpu_indexes[0]);
}
host_resolve_signed_overflow<Torus>(
streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);
cuda_memcpy_async_gpu_to_gpu(lhs, result, num_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
}
#endif // TFHE_RS_ADDITION_CUH
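
The resolution step deleted here follows the usual two's-complement rule: signed overflow occurred if and only if the carry into the sign position differs from the carry out of it, which is exactly the information carried by last_block_input_carry and last_block_output_carry. A clear-text illustration of that rule (purely conceptual, not the encrypted path):

#include <cstdint>
#include <cstdio>

// Signed overflow <=> carry into the sign bit != carry out of the sign bit.
static bool signed_add_overflows(int8_t a, int8_t b) {
  uint8_t ua = (uint8_t)a, ub = (uint8_t)b;
  uint8_t carry_in_sign = (uint8_t)(((ua & 0x7f) + (ub & 0x7f)) >> 7);
  uint8_t carry_out_sign = (uint8_t)(((uint16_t)ua + ub) >> 8);
  return carry_in_sign != carry_out_sign;
}

int main() {
  printf("%d\n", signed_add_overflows(100, 100)); // 1: 200 overflows int8_t
  printf("%d\n", signed_add_overflows(50, 20));   // 0: 70 fits
  return 0;
}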

View File

@@ -295,7 +295,7 @@ __host__ void host_integer_decompress(
extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
// In the case of extracting a single LWE, these parameters are dummy
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
@@ -311,7 +311,7 @@ __host__ void host_integer_decompress(
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride);
num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
@@ -338,7 +338,7 @@ __host__ void host_integer_decompress(
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride);
num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(

View File

@@ -425,11 +425,24 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto do_overflowing_sub = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
host_integer_overflowing_sub_kb<Torus>(
streams, gpu_indexes, gpu_count, new_remainder.data,
subtraction_overflowed.data, merged_interesting_remainder.data,
interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
uint32_t compute_borrow = 1;
uint32_t uses_input_borrow = 0;
auto first_indexes = mem_ptr->first_indexes_for_overflow_sub
[merged_interesting_remainder.len - 1];
auto second_indexes = mem_ptr->second_indexes_for_overflow_sub
[merged_interesting_remainder.len - 1];
auto scalar_indexes =
mem_ptr
->scalars_for_overflow_sub[merged_interesting_remainder.len - 1];
mem_ptr->overflow_sub_mem->update_lut_indexes(
streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
merged_interesting_remainder.len);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, new_remainder.data,
(uint64_t *)merged_interesting_remainder.data,
interesting_divisor.data, subtraction_overflowed.data,
(const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks,
merged_interesting_remainder.len, compute_borrow, uses_input_borrow);
};
// fills:
@@ -657,10 +670,12 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
int_mem_ptr->negated_quotient, quotient, radix_params.big_lwe_dimension,
num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
gpu_count, int_mem_ptr->negated_quotient,
nullptr, nullptr, int_mem_ptr->scp_mem_1,
bsks, ksks, num_blocks);
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1,
bsks, ksks, num_blocks, requested_flag, uses_carry);
host_integer_radix_negation(int_mem_ptr->sub_streams_2, gpu_indexes,
gpu_count, int_mem_ptr->negated_remainder,
@@ -671,7 +686,8 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
int_mem_ptr->negated_remainder, nullptr, nullptr,
int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks);
int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks, requested_flag,
uses_carry);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient,

View File

@@ -1,4 +1,5 @@
#include "integer/integer.cuh"
#include "integer/negation.cuh"
#include <linear_algebra.h>
void cuda_full_propagation_64_inplace(void *const *streams,
@@ -49,7 +50,8 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -59,30 +61,94 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
requested_flag, uses_carry, allocate_gpu_memory);
}
void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, uses_carry, allocate_gpu_memory);
}
void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_overflowing_sub<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_borrow_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
compute_overflow, allocate_gpu_memory);
}
void cuda_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_blocks,
uint32_t requested_flag, uint32_t uses_carry) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
nullptr, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), num_blocks);
static_cast<const uint64_t *>(carry_in),
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
num_blocks, requested_flag, uses_carry);
}
void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void cuda_add_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_blocks) {
host_propagate_single_carry<uint64_t>(
void *lhs_array, const void *rhs_array, void *carry_out,
const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry) {
host_add_and_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
static_cast<uint64_t *>(input_carries),
static_cast<uint64_t *>(lhs_array),
static_cast<const uint64_t *>(rhs_array),
static_cast<uint64_t *>(carry_out),
static_cast<const uint64_t *>(carry_in),
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
num_blocks);
num_blocks, requested_flag, uses_carry);
}
void cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs_array, const void *rhs_array, void *overflow_block,
const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
uint32_t uses_input_borrow) {
host_integer_overflowing_sub<uint64_t>(
(cudaStream_t const *)streams, gpu_indexes, gpu_count,
static_cast<uint64_t *>(lhs_array), static_cast<uint64_t *>(lhs_array),
static_cast<const uint64_t *>(rhs_array),
static_cast<uint64_t *>(overflow_block),
static_cast<const uint64_t *>(input_borrow),
(int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
num_blocks, compute_overflow, uses_input_borrow);
}
void cleanup_cuda_propagate_single_carry(void *const *streams,
@@ -94,6 +160,23 @@ void cleanup_cuda_propagate_single_carry(void *const *streams,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_borrow_prop_memory<uint64_t> *mem_ptr =
(int_borrow_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -142,14 +225,14 @@ void cuda_apply_many_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_blocks,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
host_apply_many_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<const uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
lut_count, lut_stride);
num_many_lut, lut_stride);
}
void scratch_cuda_apply_bivariate_lut_kb_64(
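
The extra requested_flag / uses_carry arguments threaded through these entry points select which status ciphertext the propagation produces (e.g. outputFlag::FLAG_NONE when no carry or overflow output is wanted) and whether an input carry ciphertext is supplied. Conceptually, the propagation performs the following clear-text computation on radix blocks; this model is purely illustrative, not the encrypted implementation:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  uint64_t message_modulus = 4;
  // little-endian radix-4 digits after an addition left carries in the blocks
  std::vector<uint64_t> blocks = {5, 5, 3};
  uint64_t carry = 0; // uses_carry = 0: start without an input carry
  for (auto &b : blocks) {
    b += carry;
    carry = b / message_modulus; // extracted by a LUT in the encrypted version
    b %= message_modulus;        // message part kept in place
  }
  // requested_flag decides whether this final carry is materialized as
  // carry_out on the GPU side.
  printf("blocks: %llu %llu %llu, carry_out: %llu\n",
         (unsigned long long)blocks[0], (unsigned long long)blocks[1],
         (unsigned long long)blocks[2], (unsigned long long)carry);
  return 0;
}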

File diff suppressed because it is too large

View File

@@ -209,7 +209,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
auto small_lwe_size = small_lwe_dimension + 1;
// In the case of extracting a single LWE, these parameters are dummy
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
if (num_radix_in_vec == 0)
@@ -370,7 +370,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count,
mem_ptr->params.pbs_type, lut_count, lut_stride);
mem_ptr->params.pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -418,7 +418,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count,
mem_ptr->params.pbs_type, lut_count, lut_stride);
mem_ptr->params.pbs_type, num_many_lut, lut_stride);
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
@@ -578,10 +578,15 @@ __host__ void host_integer_mult_radix_kb(
terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks, mem_ptr->luts_array);
auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
radix_lwe_out, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, num_blocks);
uint32_t block_modulus = message_modulus * carry_modulus;
uint32_t num_bits_in_block = std::log2(block_modulus);
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry);
}
template <typename Torus>

View File

@@ -12,49 +12,3 @@ void cuda_negate_integer_radix_ciphertext_64(
static_cast<const uint64_t *>(lwe_array_in), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}
void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_overflowing_sub_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
}
void cuda_integer_radix_overflowing_sub_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
host_integer_overflowing_sub_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<const uint64_t *>(radix_lwe_left),
static_cast<const uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
mem, num_blocks);
}
void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_overflowing_sub_memory<uint64_t> *mem_ptr =
(int_overflowing_sub_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

View File

@@ -91,7 +91,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
*mem_ptr = new int_overflowing_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
}
/*
template <typename Torus>
__host__ void host_integer_overflowing_sub_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -113,4 +113,39 @@ __host__ void host_integer_overflowing_sub_kb(
mem_ptr, bsks, ksks, num_blocks);
}
*/
template <typename Torus>
__host__ void host_integer_overflowing_sub(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array,
const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow,
int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
uint32_t uses_input_borrow) {
auto radix_params = mem_ptr->params;
// We need to recalculate num_groups because the number of blocks changes
// during the division
uint32_t block_modulus =
radix_params.message_modulus * radix_params.carry_modulus;
uint32_t num_bits_in_block = std::log2(block_modulus);
uint32_t grouping_size = num_bits_in_block;
uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
auto stream = (cudaStream_t *)streams;
host_unchecked_sub_with_correcting_term<Torus>(
stream[0], gpu_indexes[0], static_cast<Torus *>(lwe_out_array),
static_cast<Torus *>(lhs_array), static_cast<const Torus *>(rhs_array),
radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
radix_params.carry_modulus, radix_params.message_modulus - 1);
host_single_borrow_propagate<Torus>(
streams, gpu_indexes, gpu_count, static_cast<Torus *>(lwe_out_array),
static_cast<Torus *>(overflow_block),
static_cast<const Torus *>(input_borrow),
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
num_blocks, num_groups, compute_overflow, uses_input_borrow);
}
#endif
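
The group count above is a ceiling division: blocks are propagated in groups of num_bits_in_block (the log2 of message_modulus * carry_modulus), so num_groups = ceil(num_blocks / grouping_size). For instance, with message_modulus = carry_modulus = 4 a block carries 4 bits, and 10 blocks form 3 groups:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t message_modulus = 4, carry_modulus = 4, num_blocks = 10;
  uint32_t block_modulus = message_modulus * carry_modulus;        // 16
  uint32_t num_bits_in_block = (uint32_t)std::log2(block_modulus); // 4
  uint32_t grouping_size = num_bits_in_block;
  // same ceiling division as host_integer_overflowing_sub above
  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
  printf("num_groups = %u\n", num_groups); // 3
  return 0;
}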

View File

@@ -112,10 +112,12 @@ __host__ void host_integer_scalar_mul_radix(
terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j, nullptr);
auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
nullptr, nullptr, scp_mem_ptr, bsks, ksks,
num_radix_blocks);
auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<T>(
streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, uses_carry);
}
}

View File

@@ -57,6 +57,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
static_cast<const uint64_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u32 input LWE ciphertext vector with a u32
* plaintext vector. See the equivalent operation on u64 data for more details.

View File

@@ -82,6 +82,46 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
check_cuda_error(cudaGetLastError());
}
template <typename T>
__global__ void pack_for_overflowing_ops(T *output, T const *input_1,
T const *input_2, uint32_t num_entries,
uint32_t message_modulus) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = input_1[index] * message_modulus + input_2[index];
}
}
template <typename T>
__host__ void host_pack_for_overflowing_ops(cudaStream_t stream,
uint32_t gpu_index, T *output,
T const *input_1, T const *input_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
pack_for_overflowing_ops<T><<<grid, thds, 0, stream>>>(
&output[(input_lwe_ciphertext_count - 1) * lwe_size],
&input_1[(input_lwe_ciphertext_count - 1) * lwe_size],
&input_2[(input_lwe_ciphertext_count - 1) * lwe_size], lwe_size,
message_modulus);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__global__ void subtraction(T *output, T const *input_1, T const *input_2,
uint32_t num_entries) {
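
The packing kernel relies on unsigned wrap-around: the last block of each operand is combined into a single block holding input_1 * message_modulus + input_2, so a later LUT can inspect both values at once. The same arithmetic on clear values (the kernel applies it to every coefficient of the last LWE ciphertexts):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t message_modulus = 4; // 2-bit message space
  uint64_t m1 = 3, m2 = 2;
  // 3 * 4 + 2 = 14 encodes both messages in one block; any overflow wraps
  // modulo 2^64, exactly like the kernel's unsigned Torus type.
  uint64_t packed = m1 * message_modulus + m2;
  printf("packed=%llu m1=%llu m2=%llu\n", (unsigned long long)packed,
         (unsigned long long)(packed / message_modulus),
         (unsigned long long)(packed % message_modulus));
  return 0;
}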

View File

@@ -92,7 +92,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
@@ -126,7 +126,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu,
lut_count, lut_stride);
num_many_lut, lut_stride);
}
break;
default:
@@ -165,7 +165,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
num_inputs_on_gpu, lut_count, lut_stride);
num_inputs_on_gpu, num_many_lut, lut_stride);
}
break;
case CLASSICAL:
@@ -194,7 +194,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu,
lut_count, lut_stride);
num_many_lut, lut_stride);
}
break;
default:

View File

@@ -45,7 +45,7 @@ __global__ void device_programmable_bootstrap_cg(
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t lut_count,
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
uint32_t lut_stride) {
grid_group grid = this_grid();
@@ -152,8 +152,8 @@ __global__ void device_programmable_bootstrap_cg(
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
@@ -168,8 +168,8 @@ __global__ void device_programmable_bootstrap_cg(
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
@@ -235,7 +235,7 @@ __host__ void host_programmable_bootstrap_cg(
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
@@ -273,7 +273,7 @@ __host__ void host_programmable_bootstrap_cg(
kernel_args[10] = &base_log;
kernel_args[11] = &level_count;
kernel_args[12] = &d_mem;
kernel_args[14] = &lut_count;
kernel_args[14] = &num_many_lut;
kernel_args[15] = &lut_stride;
if (max_shared_memory < partial_sm) {
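
The kernel_args bookkeeping above exists because cooperative kernels take their parameters as an array of pointers, which is why this diff only has to repoint kernel_args[14] at num_many_lut. A minimal sketch of that launch pattern with the plain CUDA runtime (the kernel body is a placeholder, not the PBS kernel):

#include <cstdint>
#include <cuda_runtime.h>

__global__ void placeholder_kernel(uint32_t num_many_lut, uint32_t lut_stride) {
  // real cooperative kernels synchronize across blocks via this_grid().sync()
}

void launch(cudaStream_t stream, uint32_t num_many_lut, uint32_t lut_stride) {
  void *kernel_args[2];
  kernel_args[0] = &num_many_lut; // each slot points at one kernel parameter
  kernel_args[1] = &lut_stride;
  cudaLaunchCooperativeKernel((void *)placeholder_kernel, dim3(1), dim3(32),
                              kernel_args, 0, stream);
}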

View File

@@ -32,7 +32,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
@@ -134,8 +135,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// default
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
@@ -153,8 +154,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
@@ -293,7 +294,7 @@ __host__ void execute_cg_external_product_loop(
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
uint32_t lut_stride) {
uint64_t full_sm =
@@ -343,7 +344,7 @@ __host__ void execute_cg_external_product_loop(
kernel_args[16] = &chunk_size;
kernel_args[17] = &keybundle_size_per_input;
kernel_args[18] = &d_mem;
kernel_args[20] = &lut_count;
kernel_args[20] = &num_many_lut;
kernel_args[21] = &lut_stride;
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
@@ -379,7 +380,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
auto lwe_chunk_size = buffer->lwe_chunk_size;
@@ -397,7 +398,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, lut_count,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
}
}

View File

@@ -123,7 +123,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
switch (polynomial_size) {
@@ -133,7 +133,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_tbc<Torus, Degree<512>>(
@@ -141,7 +141,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
@@ -149,7 +149,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
@@ -157,7 +157,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
@@ -165,7 +165,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
@@ -173,7 +173,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
@@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -380,7 +380,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
switch (polynomial_size) {
@@ -390,7 +390,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_cg<Torus, Degree<512>>(
@@ -398,7 +398,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_cg<Torus, Degree<1024>>(
@@ -406,7 +406,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
@@ -414,7 +414,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
@@ -422,7 +422,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
@@ -430,7 +430,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
@@ -438,7 +438,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -455,7 +455,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
switch (polynomial_size) {
@@ -465,7 +465,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap<Torus, Degree<512>>(
@@ -473,7 +473,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap<Torus, Degree<1024>>(
@@ -481,7 +481,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -489,7 +489,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -497,7 +497,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -505,7 +505,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -513,7 +513,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -531,7 +531,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 32)
PANIC("Cuda error (classical PBS): base log should be <= 32")
@@ -551,7 +551,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -566,7 +566,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
@@ -578,7 +578,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -653,7 +653,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
@@ -672,7 +672,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -687,7 +687,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -699,7 +699,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride);
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -727,7 +727,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -737,7 +737,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
@@ -758,7 +758,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
@@ -768,7 +768,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
@@ -797,7 +797,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -806,7 +806,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
void *stream, uint32_t gpu_index,

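Throughout these launchers the former lut_count parameter becomes num_many_lut: a single PBS pass can apply several LUTs to the same input and emit one output ciphertext per LUT, with lut_stride giving the accumulator offset between LUTs. As a rough sketch of the output layout the kernels above assume (the helper name is hypothetical, and it assumes gridDim.x equals the number of samples, matching the kernel-side offset i * gridDim.x * (glwe_dimension * polynomial_size + 1) and the multi-GPU gather further down):

#include <cstdint>

// Hypothetical indexing helper, for illustration only: locate the result of
// LUT `lut_id` for input `sample` in a many-LUT PBS output buffer, laid out
// LUT-major.
inline const uint64_t *many_lut_result(const uint64_t *lwe_array_out,
                                       uint32_t lut_id, uint32_t sample,
                                       uint32_t num_samples,
                                       uint32_t glwe_dimension,
                                       uint32_t polynomial_size) {
  const uint32_t lwe_size = glwe_dimension * polynomial_size + 1;
  return lwe_array_out + (lut_id * num_samples + sample) * lwe_size;
}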

@@ -142,7 +142,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
uint32_t lwe_iteration, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -217,8 +217,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
@@ -233,8 +233,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
@@ -412,8 +412,8 @@ __host__ void execute_step_two(
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, uint32_t lut_count,
uint32_t lut_stride) {
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
uint32_t num_many_lut, uint32_t lut_stride) {
int max_shared_memory = cuda_get_max_shared_memory(0);
cudaSetDevice(gpu_index);
@@ -426,21 +426,21 @@ __host__ void execute_step_two(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm, lut_count, lut_stride);
level_count, d_mem, full_dm, num_many_lut, lut_stride);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm, lut_count, lut_stride);
level_count, d_mem, partial_dm, num_many_lut, lut_stride);
} else {
device_programmable_bootstrap_step_two<Torus, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0, lut_count, lut_stride);
level_count, d_mem, 0, num_many_lut, lut_stride);
}
check_cuda_error(cudaGetLastError());
}
@@ -456,7 +456,7 @@ __host__ void host_programmable_bootstrap(
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
cudaSetDevice(gpu_index);
// With SM each block corresponds to either the mask or body, no need to
@@ -493,7 +493,7 @@ __host__ void host_programmable_bootstrap(
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
lut_count, lut_stride);
num_many_lut, lut_stride);
}
}
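The launcher above picks one of three kernel variants (NOSM, PARTIALSM, FULLSM) from the shared memory available on the device. A minimal sketch of that selection rule, with the thresholds partial_sm/full_sm computed per parameter set as in the scratch functions (pick_sm_variant is a hypothetical name):

#include <cstdint>

enum class SmVariant { NoSm, PartialSm, FullSm };

// Sketch of the choice made by execute_step_two: spill everything to device
// memory, keep only part of the working set in shared memory, or keep the
// full accumulator in shared memory.
inline SmVariant pick_sm_variant(uint64_t max_shared_memory,
                                 uint64_t partial_sm, uint64_t full_sm) {
  if (max_shared_memory < partial_sm)
    return SmVariant::NoSm;
  if (max_shared_memory < full_sm)
    return SmVariant::PartialSm;
  return SmVariant::FullSm;
}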


@@ -67,7 +67,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
@@ -76,7 +76,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -84,7 +84,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -92,7 +92,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -100,7 +100,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -108,7 +108,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -116,7 +116,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -124,7 +124,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -142,7 +142,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
@@ -151,7 +151,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -159,7 +159,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -167,7 +167,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -175,7 +175,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -183,7 +183,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -191,7 +191,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -199,7 +199,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -215,7 +215,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
if (base_log > 64)
@@ -236,7 +236,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
#else
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
@@ -251,7 +251,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -263,7 +263,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
@@ -499,7 +499,7 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
uint32_t num_many_lut, uint32_t lut_stride);
template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
void *stream, uint32_t gpu_index,
@@ -516,7 +516,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
uint32_t num_many_lut, uint32_t lut_stride);
template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
@@ -588,7 +588,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 32)
PANIC("Cuda error (multi-bit PBS): base log should be <= 32")
@@ -600,7 +600,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -608,7 +608,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -616,7 +616,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 2048: {
int num_sms = 0;
@@ -629,14 +629,14 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, num_samples, lut_count, lut_stride);
level_count, num_samples, num_many_lut, lut_stride);
else
host_tbc_multi_bit_programmable_bootstrap<Torus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, num_samples, lut_count, lut_stride);
level_count, num_samples, num_many_lut, lut_stride);
break;
}
@@ -646,7 +646,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -654,7 +654,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -662,7 +662,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lut_count, lut_stride);
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -685,5 +685,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
uint32_t num_many_lut, uint32_t lut_stride);
#endif


@@ -48,7 +48,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
__shared__ uint32_t monomial_degrees[8]; // 1 << grouping_factor entries (grouping factor <= 3)
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
@@ -59,6 +59,189 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
double2 *fft = (double2 *)selected_memory;
double2 *fft2 = fft + polynomial_size / 2;
// Ids
uint32_t level_id = blockIdx.z;
// blockIdx.y now indexes the GLWE dimension directly: the keybundle grid was
// reduced from (glwe_dimension + 1)^2 to (glwe_dimension + 1) blocks in y,
// since each block now handles both polynomials of its pair
uint32_t glwe_id = blockIdx.y;
uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset);
uint32_t input_idx = blockIdx.x / lwe_chunk_size;
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
double2 *keybundle = keybundle_array +
// select the input
input_idx * keybundle_size_per_input;
////////////////////////////////////////////////////////////
// Computes all keybundles
uint32_t rev_lwe_iteration =
((lwe_dimension / grouping_factor) - lwe_iteration - 1);
if (threadIdx.x < (1 << grouping_factor)) {
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
synchronize_threads_in_block();
////////////////////////////////////////////////////////////
// Keygen guarantees the first term is the constant term of the polynomial,
// so no polynomial multiplication is required
const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
const Torus *bsk_poly_ini = bsk_slice;
Torus reg_acc[params::opt];
Torus reg_acc2[params::opt];
// Load both polynomials of this bsk slice into registers in one fused pass
copy_polynomial_in_regs_vec<Torus, params::opt,
                            params::degree / params::opt>(
    bsk_poly_ini, reg_acc, bsk_poly_ini + params::degree, reg_acc2);
int offset =
get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count);
// Accumulate the other terms
for (int g = 1; g < (1 << grouping_factor); g++) {
uint32_t monomial_degree = monomial_degrees[g];
const Torus *bsk_poly = bsk_poly_ini + g * offset;
const Torus *bsk_poly2 = bsk_poly_ini + g * offset + params::degree;
// Multiply by the bsk element
polynomial_product_accumulate_by_monomial_nosync_vec<Torus, params>(
reg_acc, reg_acc2, bsk_poly, bsk_poly2, monomial_degree);
}
// The accumulation above stayed in registers, and monomial_degrees now has
// its own __shared__ array, so no synchronization is needed before the FFT:
// the vectorized FFT writes into the shared-memory buffers fft and fft2.
// Repack the register accumulators as complex doubles (each double2 packs
// coefficients i and i + params::opt / 2), normalized by the torus modulus.
double2 u[params::opt >> 2];
double2 v[params::opt >> 2];
double2 u2[params::opt >> 2];
double2 v2[params::opt >> 2];
for (int i = 0; i < params::opt / 4; i++) {
u[i] =
make_double2(__ll2double_rn((int64_t)reg_acc[i]) /
(double)std::numeric_limits<Torus>::max(),
__ll2double_rn((int64_t)reg_acc[i + params::opt / 2]) /
(double)std::numeric_limits<Torus>::max());
u2[i] =
make_double2(__ll2double_rn((int64_t)reg_acc2[i]) /
(double)std::numeric_limits<Torus>::max(),
__ll2double_rn((int64_t)reg_acc2[i + params::opt / 2]) /
(double)std::numeric_limits<Torus>::max());
v[i] = make_double2(
__ll2double_rn((int64_t)reg_acc[i + params::opt / 4]) /
(double)std::numeric_limits<Torus>::max(),
__ll2double_rn(
(int64_t)reg_acc[i + params::opt / 2 + params::opt / 4]) /
(double)std::numeric_limits<Torus>::max());
v2[i] = make_double2(
__ll2double_rn((int64_t)reg_acc2[i + params::opt / 4]) /
(double)std::numeric_limits<Torus>::max(),
__ll2double_rn(
(int64_t)reg_acc2[i + params::opt / 2 + params::opt / 4]) /
(double)std::numeric_limits<Torus>::max());
}
NSMFFT_direct2_vec<HalfDegree<params>>(fft, fft2, u, v, u2, v2);
// Write both FFT results to this lwe iteration's keybundle slot
auto keybundle_out = get_ith_mask_kth_block(
    keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id,
    polynomial_size, glwe_dimension, level_count);
copy_polynomial_vec<double2, params::opt / 2, params::degree / params::opt>(
    fft, keybundle_out, fft2, keybundle_out + params::degree / 2);
}
}
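The vectorized keybundle above trades shared-memory round-trips for register pressure: both bsk polynomials are accumulated in registers, converted to complex form, and handed to a fused double FFT (NSMFFT_direct2_vec) that writes into fft/fft2. A standalone sketch of the torus-to-double conversion used in the repacking (torus64_to_double is a hypothetical name, assuming a 64-bit torus):

#include <cstdint>
#include <limits>

// Maps a 64-bit torus element to a double in roughly [-0.5, 0.5), as the
// make_double2 calls above do: reinterpret as signed, then normalize by the
// torus modulus. Sketch only.
__device__ inline double torus64_to_double(uint64_t t) {
  return __ll2double_rn((int64_t)t) /
         (double)std::numeric_limits<uint64_t>::max();
}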
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle_bck(
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
__shared__ uint32_t monomial_degrees[8]; // 1 << grouping_factor entries (grouping factor <= 3)
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
double2 *fft = (double2 *)selected_memory;
// Ids
uint32_t level_id = blockIdx.z;
@@ -98,7 +281,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count);
// Precalculate the monomial degrees and store them in shared memory
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
// uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
@@ -117,23 +301,43 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
polynomial_product_accumulate_by_monomial_nosync<Torus, params>(
reg_acc, bsk_poly, monomial_degree);
}
// The accumulation stayed in registers, so no synchronization is needed
// before repacking; the vectorized FFT writes into the shared-memory
// buffer fft declared above.
// Repack the register accumulator as complex doubles (each double2 packs
// coefficients i and i + params::opt / 2), normalized by the torus modulus.
double2 u[params::opt >> 2];
double2 v[params::opt >> 2];
for (int i = 0; i < params::opt / 4; i++) {
  u[i] =
      make_double2(__ll2double_rn((int64_t)reg_acc[i]) /
                       (double)std::numeric_limits<Torus>::max(),
                   __ll2double_rn((int64_t)reg_acc[i + params::opt / 2]) /
                       (double)std::numeric_limits<Torus>::max());
}
for (int i = 0; i < params::opt / 4; i++) {
  v[i] = make_double2(
      __ll2double_rn((int64_t)reg_acc[i + params::opt / 4]) /
          (double)std::numeric_limits<Torus>::max(),
      __ll2double_rn(
          (int64_t)reg_acc[i + params::opt / 2 + params::opt / 4]) /
          (double)std::numeric_limits<Torus>::max());
}
NSMFFT_direct2<HalfDegree<params>>(fft, u, v);
// lwe iteration
auto keybundle_out = get_ith_mask_kth_block(
@@ -253,7 +457,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
uint32_t lwe_chunk_size, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t lut_count,
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -326,8 +530,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
@@ -342,8 +546,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
@@ -363,7 +567,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator
return sizeof(double2) * polynomial_size; // two half-size accumulators (fft and fft2)
}
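For example, with polynomial_size = 2048 this now reserves sizeof(double2) * 2048 = 32 KiB of shared memory per keybundle block instead of the previous 16 KiB, since the vectorized kernel keeps two half-size FFT buffers (fft and fft2) live at once.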
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
@@ -513,8 +717,12 @@ __host__ void execute_compute_keybundle(
auto keybundle_fft = buffer->keybundle_fft;
// Compute a keybundle
dim3 grid_keybundle(num_samples * chunk_size,
(glwe_dimension + 1) * (glwe_dimension + 1), level_count);
dim3 grid_keybundle(num_samples * chunk_size, (glwe_dimension + 1),
level_count);
dim3 thds(polynomial_size / params::opt, 1, 1);
if (max_shared_memory < full_sm_keybundle)
@@ -591,12 +799,14 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
}
template <typename Torus, class params>
__host__ void execute_step_two(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
uint32_t j, uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {
__host__ void
execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, int32_t grouping_factor,
uint32_t level_count, uint32_t j, uint32_t lwe_offset,
uint32_t num_many_lut, uint32_t lut_stride) {
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint64_t full_sm_accumulate_step_two =
@@ -621,7 +831,7 @@ __host__ void execute_step_two(
global_accumulator, global_accumulator_fft, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor, j,
lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two,
lut_count, lut_stride);
num_many_lut, lut_stride);
else
device_multi_bit_programmable_bootstrap_accumulate_step_two<Torus, params,
FULLSM>
@@ -630,7 +840,7 @@ __host__ void execute_step_two(
global_accumulator, global_accumulator_fft, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0,
lut_count, lut_stride);
num_many_lut, lut_stride);
check_cuda_error(cudaGetLastError());
}
@@ -643,7 +853,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
auto lwe_chunk_size = buffer->lwe_chunk_size;
@@ -667,7 +877,8 @@ __host__ void host_multi_bit_programmable_bootstrap(
execute_step_two<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, j, lwe_offset, lut_count, lut_stride);
grouping_factor, level_count, j, lwe_offset, num_many_lut,
lut_stride);
}
}
}
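The host driver above walks the LWE mask in chunks: one batch of keybundles is produced, then steps one and two run for every group index j inside that chunk. A sketch of the loop structure under the names used in this file (run_multi_bit_pbs_sketch is hypothetical; chunk accounting simplified, real calls elided):

#include <algorithm>
#include <cstdint>

// Sketch: produce keybundles one chunk at a time, then consume them.
template <typename Torus, class params>
void run_multi_bit_pbs_sketch(uint32_t lwe_dimension, uint32_t grouping_factor,
                              uint32_t lwe_chunk_size, uint32_t num_many_lut,
                              uint32_t lut_stride) {
  for (uint32_t lwe_offset = 0; lwe_offset < lwe_dimension / grouping_factor;
       lwe_offset += lwe_chunk_size) {
    uint32_t chunk_size =
        std::min(lwe_chunk_size, lwe_dimension / grouping_factor - lwe_offset);
    // execute_compute_keybundle<Torus, params>(..., lwe_offset, chunk_size);
    for (uint32_t j = 0; j < chunk_size; j++) {
      // execute_step_one<Torus, params>(...);
      // execute_step_two<Torus, params>(..., j, lwe_offset, num_many_lut,
      //                                 lut_stride);
    }
  }
}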


@@ -45,8 +45,8 @@ __global__ void device_programmable_bootstrap_tbc(
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block, bool support_dsm, uint32_t lut_count,
uint32_t lut_stride) {
uint64_t device_memory_size_per_block, bool support_dsm,
uint32_t num_many_lut, uint32_t lut_stride) {
cluster_group cluster = this_cluster();
@@ -158,8 +158,8 @@ __global__ void device_programmable_bootstrap_tbc(
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
@@ -175,8 +175,8 @@ __global__ void device_programmable_bootstrap_tbc(
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
@@ -261,7 +261,7 @@ __host__ void host_programmable_bootstrap_tbc(
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
auto supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
@@ -317,7 +317,7 @@ __host__ void host_programmable_bootstrap_tbc(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
supports_dsm, lut_count, lut_stride));
supports_dsm, num_many_lut, lut_stride));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
@@ -326,7 +326,7 @@ __host__ void host_programmable_bootstrap_tbc(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
partial_dm, supports_dsm, lut_count, lut_stride));
partial_dm, supports_dsm, num_many_lut, lut_stride));
} else {
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
@@ -335,7 +335,7 @@ __host__ void host_programmable_bootstrap_tbc(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
supports_dsm, lut_count, lut_stride));
supports_dsm, num_many_lut, lut_stride));
}
}
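The TBC path cannot use the plain <<<...>>> syntax: thread block clusters require the extended launch API, which is why the calls above go through check_cuda_error(cudaLaunchKernelEx(...)) with a config whose dynamicSmemBytes is set per variant. A minimal sketch of such a launch (launch_with_cluster is a hypothetical name; the cluster size of 2 in x is an arbitrary illustrative value, not the one computed by this file):

#include <cuda_runtime.h>

// Sketch of a thread-block-cluster launch on Hopper-class GPUs.
template <typename... Args>
cudaError_t launch_with_cluster(void (*kernel)(Args...), dim3 grid, dim3 block,
                                size_t dynamic_smem, cudaStream_t stream,
                                Args... args) {
  cudaLaunchConfig_t config = {};
  config.gridDim = grid;
  config.blockDim = block;
  config.dynamicSmemBytes = dynamic_smem;
  config.stream = stream;

  cudaLaunchAttribute attr = {};
  attr.id = cudaLaunchAttributeClusterDimension;
  attr.val.clusterDim.x = 2; // blocks per cluster; must divide grid.x
  attr.val.clusterDim.y = 1;
  attr.val.clusterDim.z = 1;
  config.attrs = &attr;
  config.numAttrs = 1;

  return cudaLaunchKernelEx(&config, kernel, args...);
}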


@@ -32,7 +32,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
bool support_dsm, uint32_t lut_count, uint32_t lut_stride) {
bool support_dsm, uint32_t num_many_lut, uint32_t lut_stride) {
cluster_group cluster = this_cluster();
@@ -141,8 +141,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// blocks, in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
@@ -157,8 +157,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
@@ -299,7 +299,7 @@ __host__ void execute_tbc_external_product_loop(
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
uint32_t lut_stride) {
auto lwe_chunk_size = buffer->lwe_chunk_size;
@@ -363,7 +363,7 @@ __host__ void execute_tbc_external_product_loop(
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, full_dm, supports_dsm, lut_count,
keybundle_size_per_input, d_mem, full_dm, supports_dsm, num_many_lut,
lut_stride));
} else if (max_shared_memory < full_dm + minimum_dm) {
config.dynamicSmemBytes = partial_dm + minimum_dm;
@@ -375,7 +375,7 @@ __host__ void execute_tbc_external_product_loop(
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, partial_dm, supports_dsm, lut_count,
keybundle_size_per_input, d_mem, partial_dm, supports_dsm, num_many_lut,
lut_stride));
} else {
config.dynamicSmemBytes = full_dm + minimum_dm;
@@ -387,7 +387,7 @@ __host__ void execute_tbc_external_product_loop(
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, 0, supports_dsm, lut_count,
keybundle_size_per_input, d_mem, 0, supports_dsm, num_many_lut,
lut_stride));
}
}
@@ -401,7 +401,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride) {
uint32_t num_many_lut, uint32_t lut_stride) {
cudaSetDevice(gpu_index);
auto lwe_chunk_size = buffer->lwe_chunk_size;
@@ -419,7 +419,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, lut_count,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
}
}


@@ -17,6 +17,18 @@ __device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
tid = tid + block_size;
}
}
template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial_vec(const T *__restrict__ source, T *dst,
const T *__restrict__ source2, T *dst2) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < elems_per_thread; i++) {
dst[tid] = source[tid];
dst2[tid] = source2[tid];
tid = tid + block_size;
}
}
template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial_in_regs(const T *__restrict__ source, T *dst) {
#pragma unroll
@@ -25,6 +37,17 @@ __device__ void copy_polynomial_in_regs(const T *__restrict__ source, T *dst) {
}
}
template <typename T, int elems_per_thread, int block_size>
__device__ void
copy_polynomial_in_regs_vec(const T *__restrict__ source, T *dst,
const T *__restrict__ source2, T *dst2) {
#pragma unroll
for (int i = 0; i < elems_per_thread; i++) {
dst[i] = source[threadIdx.x + i * block_size];
dst2[i] = source2[threadIdx.x + i * block_size];
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
*


@@ -130,4 +130,40 @@ __device__ void polynomial_product_accumulate_by_monomial_nosync(
}
}
template <typename T, class params>
__device__ void polynomial_product_accumulate_by_monomial_nosync_vec(
T *result, T *result2, const T *__restrict__ poly,
const T *__restrict__ poly2, uint32_t monomial_degree) {
// monomial_degree \in [0, 2 * params::degree)
int full_cycles_count = monomial_degree / params::degree;
int remainder_degrees = monomial_degree % params::degree;
// Every thread has a fixed position to track instead of "chasing" the
// position
#pragma unroll
for (int i = 0; i < params::opt; i++) {
int pos =
(threadIdx.x + i * (params::degree / params::opt) - monomial_degree) &
(params::degree - 1);
T element = poly[pos];
T element2 = poly2[pos];
T x = SEL(element, -element, full_cycles_count % 2);
T x2 = SEL(element2, -element2, full_cycles_count % 2);
bool condition =
threadIdx.x + i * (params::degree / params::opt) >= remainder_degrees;
x = SEL(-x, x, condition);
x2 = SEL(-x2, x2, condition);
result[i] += x;
result2[i] += x2;
}
}
#endif // CNCRT_POLYNOMIAL_MATH_H
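A CPU reference for what the register-resident routine above computes helps validate the SEL-based sign logic: multiplying by the monomial X^d in the negacyclic ring Z[X]/(X^N + 1), where every wrap past degree N flips the sign because X^N == -1. A sketch for testing, not part of the library (monomial_product_accumulate_ref is a hypothetical name):

#include <cstdint>
#include <vector>

// acc += poly * X^monomial_degree in Z[X]/(X^N + 1), with monomial_degree in
// [0, 2N). Each wrap across N negates the coefficient.
template <typename T>
void monomial_product_accumulate_ref(std::vector<T> &acc,
                                     const std::vector<T> &poly,
                                     uint32_t monomial_degree) {
  const size_t N = poly.size();
  for (size_t i = 0; i < N; i++) {
    size_t shifted = i + monomial_degree;
    bool negate = (shifted / N) % 2 == 1;
    T term = negate ? static_cast<T>(-poly[i]) : poly[i];
    acc[shifted % N] += term;
  }
}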


@@ -46,6 +46,24 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
}
}
/// Allocates the input/output vector for all devices
/// Also initializes the related indexing to the trivial index
template <typename Torus>
void multi_gpu_alloc_lwe_many_lut_output_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t num_many_lut, uint32_t lwe_size) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
Torus *d_array = (Torus *)cuda_malloc_async(num_many_lut * inputs_on_gpu *
lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i]);
dest[i] = d_array;
}
}
/// Load an array residing on one GPU to all active gpus
/// and split the array among them.
/// The input indexing logic is given by an index array.
@@ -126,6 +144,49 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
}
}
/// Copy data from multiple GPUs back to GPU 0 following the indexing given in
/// dest_indexes
/// The input indexing should be the trivial one
template <typename Torus>
void multi_gpu_gather_many_lut_lwe_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *dest, const std::vector<Torus *> &src,
Torus *h_dest_indexes, bool is_trivial_index, uint32_t num_inputs,
uint32_t lwe_size, uint32_t num_many_lut) {
for (uint lut_id = 0; lut_id < num_many_lut; lut_id++) {
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
auto gpu_offset = 0;
for (uint j = 0; j < i; j++) {
gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count);
}
if (is_trivial_index) {
auto d_dest =
dest + gpu_offset * lwe_size + lut_id * num_inputs * lwe_size;
auto d_src = src[i] + lut_id * inputs_on_gpu * lwe_size;
cuda_memcpy_async_gpu_to_gpu(d_dest, d_src,
inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i]);
} else {
auto dest_indexes = h_dest_indexes + gpu_offset;
for (uint j = 0; j < inputs_on_gpu; j++) {
auto d_dest = dest + dest_indexes[j] * lwe_size +
lut_id * num_inputs * lwe_size;
auto d_src =
src[i] + j * lwe_size + lut_id * inputs_on_gpu * lwe_size;
cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i]);
}
}
}
}
}
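Both the allocation and the gather above rely on get_num_inputs_on_gpu to split the inputs across devices. A sketch of the even split this code appears to assume (num_inputs_on_gpu_sketch is a hypothetical mirror for illustration; the real policy lives in the existing helper):

#include <cstdint>

// Distribute num_inputs as evenly as possible, giving the first
// (num_inputs % gpu_count) GPUs one extra input.
inline uint32_t num_inputs_on_gpu_sketch(uint32_t num_inputs, uint32_t gpu_i,
                                         uint32_t gpu_count) {
  uint32_t base = num_inputs / gpu_count;
  uint32_t remainder = num_inputs % gpu_count;
  return base + (gpu_i < remainder ? 1 : 0);
}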
template <typename Torus>
void multi_gpu_release_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,


@@ -177,7 +177,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
@@ -186,7 +186,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count,
pbs_base_log, pbs_level, input_lwe_ciphertext_count, num_many_lut,
lut_stride);
cuda_synchronize_stream(stream, gpu_index);
}
@@ -208,7 +208,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
@@ -221,7 +221,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
(const uint64_t *)d_lwe_input_indexes, (const uint64_t *)d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count, lut_count, lut_stride);
pbs_level, input_lwe_ciphertext_count, num_many_lut, lut_stride);
cuda_synchronize_stream(stream, gpu_index);
}
@@ -234,7 +234,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
@@ -243,7 +243,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count,
pbs_base_log, pbs_level, input_lwe_ciphertext_count, num_many_lut,
lut_stride);
cuda_synchronize_stream(stream, gpu_index);
}
@@ -265,7 +265,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
@@ -276,7 +276,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count, lut_count, lut_stride);
input_lwe_ciphertext_count, num_many_lut, lut_stride);
cuda_synchronize_stream(stream, gpu_index);
}
@@ -297,7 +297,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
@@ -308,7 +308,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count, lut_count, lut_stride);
input_lwe_ciphertext_count, num_many_lut, lut_stride);
cuda_synchronize_stream(stream, gpu_index);
}
@@ -322,7 +322,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
@@ -333,7 +333,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count, lut_count, lut_stride);
input_lwe_ciphertext_count, num_many_lut, lut_stride);
cuda_synchronize_stream(stream, gpu_index);
}


@@ -173,7 +173,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
// Here execute the PBS
for (int r = 0; r < repetitions; r++) {
@@ -192,7 +192,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, lut_count, lut_stride);
pbs_level, number_of_inputs, num_many_lut, lut_stride);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *


@@ -119,7 +119,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
(glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size *
(1 << grouping_factor);
uint32_t lut_count = 1;
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
for (int r = 0; r < repetitions; r++) {
uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
@@ -137,7 +137,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, number_of_inputs, lut_count, lut_stride);
pbs_level, number_of_inputs, num_many_lut, lut_stride);
// Copy result to the host memory
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,


@@ -721,6 +721,32 @@ extern "C" {
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
requested_flag: u32,
uses_carry: u32,
allocate_gpu_memory: bool,
);
}
extern "C" {
pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
requested_flag: u32,
uses_carry: u32,
allocate_gpu_memory: bool,
);
}
@@ -731,24 +757,30 @@ extern "C" {
gpu_count: u32,
lwe_array: *mut ffi::c_void,
carry_out: *mut ffi::c_void,
carry_in: *const ffi::c_void,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
num_blocks: u32,
requested_flag: u32,
uses_carry: u32,
);
}
extern "C" {
pub fn cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
pub fn cuda_add_and_propagate_single_carry_kb_64_inplace(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array: *mut ffi::c_void,
lhs_array: *mut ffi::c_void,
rhs_array: *const ffi::c_void,
carry_out: *mut ffi::c_void,
input_carries: *mut ffi::c_void,
carry_in: *const ffi::c_void,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
num_blocks: u32,
requested_flag: u32,
uses_carry: u32,
);
}
extern "C" {
@@ -759,6 +791,62 @@ extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn cleanup_cuda_add_and_propagate_single_carry(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
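These bindings follow the scratch / run / cleanup lifecycle used throughout the backend. A hedged sketch of how the new add-and-propagate entry points are driven from the C side, inside a host routine where all handles and sizes are already in scope (argument values are placeholders; the signatures are exactly the ones declared above):

// Sketch: allocate scratch state, run the in-place add with single-carry
// propagation, then release the state.
int8_t *mem_ptr = nullptr;
scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
    streams, gpu_indexes, gpu_count, &mem_ptr, glwe_dimension, polynomial_size,
    big_lwe_dimension, small_lwe_dimension, ks_level, ks_base_log, pbs_level,
    pbs_base_log, grouping_factor, num_blocks, message_modulus, carry_modulus,
    pbs_type, requested_flag, uses_carry, /*allocate_gpu_memory=*/true);

cuda_add_and_propagate_single_carry_kb_64_inplace(
    streams, gpu_indexes, gpu_count, lhs_array, rhs_array, carry_out, carry_in,
    mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry);

cleanup_cuda_add_and_propagate_single_carry(streams, gpu_indexes, gpu_count,
                                            &mem_ptr);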
extern "C" {
pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
compute_overflow: u32,
allocate_gpu_memory: bool,
);
}
extern "C" {
pub fn cuda_integer_overflowing_sub_kb_64_inplace(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lhs_array: *mut ffi::c_void,
rhs_array: *const ffi::c_void,
overflow_block: *mut ffi::c_void,
input_borrow: *const ffi::c_void,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
num_blocks: u32,
compute_overflow: u32,
uses_input_borrow: u32,
);
}
extern "C" {
pub fn cleanup_cuda_integer_overflowing_sub(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
streams: *const *mut ffi::c_void,
@@ -803,51 +891,6 @@ extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
);
}
extern "C" {
pub fn cuda_integer_radix_overflowing_sub_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
radix_lwe_out: *mut ffi::c_void,
radix_lwe_overflowed: *mut ffi::c_void,
radix_lwe_left: *const ffi::c_void,
radix_lwe_right: *const ffi::c_void,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
num_blocks_in_radix: u32,
);
}
extern "C" {
pub fn cleanup_cuda_integer_radix_overflowing_sub(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn scratch_cuda_integer_scalar_mul_kb_64(
streams: *const *mut ffi::c_void,
@@ -942,52 +985,6 @@ extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
signed_operation: i8,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
);
}
extern "C" {
pub fn cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lhs: *mut ffi::c_void,
rhs: *const ffi::c_void,
overflowed: *mut ffi::c_void,
signed_operation: i8,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
num_blocks_in_radix: u32,
);
}
extern "C" {
pub fn cleanup_signed_overflowing_add_or_sub(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
streams: *const *mut ffi::c_void,
@@ -1384,7 +1381,7 @@ extern "C" {
base_log: u32,
level_count: u32,
num_samples: u32,
lut_count: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
@@ -1406,7 +1403,7 @@ extern "C" {
base_log: u32,
level_count: u32,
num_samples: u32,
lut_count: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
@@ -1469,7 +1466,7 @@ extern "C" {
base_log: u32,
level_count: u32,
num_samples: u32,
lut_count: u32,
num_many_lut: u32,
lut_stride: u32,
);
}


@@ -1,12 +1,18 @@
// to follow the notation of the paper
#![allow(non_snake_case)]
pub mod pke;
pub mod pke_v2;
use std::convert::Infallible;
use std::error::Error;
use std::fmt::Display;
use tfhe_versionable::VersionsDispatch;
use tfhe_versionable::{Upgrade, Version, VersionsDispatch};
use crate::curve_api::Curve;
use crate::four_squares::{isqrt, sqr};
use crate::proofs::pke_v2::Bound;
use crate::proofs::GroupElements;
use crate::serialization::{
SerializableAffine, SerializableCubicExtField, SerializableFp, SerializableFp2,
@@ -65,6 +71,65 @@ pub(crate) enum SerializableGroupElementsVersions {
V0(SerializableGroupElements),
}
#[derive(Version)]
pub struct SerializablePKEv2PublicParamsV0 {
pub(crate) g_lists: SerializableGroupElements,
pub(crate) D: usize,
pub n: usize,
pub d: usize,
pub k: usize,
pub B: u64,
pub B_r: u64,
pub B_bound: u64,
pub m_bound: usize,
pub q: u64,
pub t: u64,
pub msbs_zero_padding_bit_count: u64,
// We use Vec<u8> since serde does not support fixed size arrays of 256 elements
pub(crate) hash: Vec<u8>,
pub(crate) hash_R: Vec<u8>,
pub(crate) hash_t: Vec<u8>,
pub(crate) hash_w: Vec<u8>,
pub(crate) hash_agg: Vec<u8>,
pub(crate) hash_lmap: Vec<u8>,
pub(crate) hash_phi: Vec<u8>,
pub(crate) hash_xi: Vec<u8>,
pub(crate) hash_z: Vec<u8>,
pub(crate) hash_chi: Vec<u8>,
}
impl Upgrade<SerializablePKEv2PublicParams> for SerializablePKEv2PublicParamsV0 {
type Error = Infallible;
fn upgrade(self) -> Result<SerializablePKEv2PublicParams, Self::Error> {
let slack_factor = isqrt((self.d + self.k) as u128) as u64;
let B_inf = self.B / slack_factor;
Ok(SerializablePKEv2PublicParams {
g_lists: self.g_lists,
D: self.D,
n: self.n,
d: self.d,
k: self.k,
B_bound_squared: sqr(self.B_bound as u128),
B_inf,
q: self.q,
t: self.t,
msbs_zero_padding_bit_count: self.msbs_zero_padding_bit_count,
bound_type: Bound::CS,
hash: self.hash,
hash_R: self.hash_R,
hash_t: self.hash_t,
hash_w: self.hash_w,
hash_agg: self.hash_agg,
hash_lmap: self.hash_lmap,
hash_phi: self.hash_phi,
hash_xi: self.hash_xi,
hash_z: self.hash_z,
hash_chi: self.hash_chi,
})
}
}
#[derive(VersionsDispatch)]
pub enum SerializablePKEv2PublicParamsVersions {
V0(SerializablePKEv2PublicParams),
@@ -74,3 +139,8 @@ pub enum SerializablePKEv2PublicParamsVersions {
pub enum SerializablePKEv1PublicParamsVersions {
V0(SerializablePKEv1PublicParams),
}
#[derive(VersionsDispatch)]
pub enum BoundVersions {
V0(Bound),
}


@@ -214,6 +214,11 @@ impl Montgomery {
pub fn four_squares(v: u128) -> [u64; 4] {
let rng = &mut StdRng::seed_from_u64(0);
// In the extreme case where the noise is exactly at the bound, v is 0
if v == 0 {
return [0; 4];
}
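// Note (not part of the diff): `four_squares` decomposes v as a sum of four
// squares, a decomposition Lagrange's four-square theorem guarantees for
// every non-negative integer; the early return handles the degenerate case
// with the all-zero decomposition.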
let f = v % 4;
if f == 2 {
let b = isqrt(v as _) as u64;


@@ -132,6 +132,130 @@ impl<G: Curve> GroupElements<G> {
}
}
/// Allows computing proofs with bad inputs for tests
#[derive(PartialEq, Eq)]
enum ProofSanityCheckMode {
Panic,
#[cfg(test)]
Ignore,
}
/// Checks the preconditions of the pke proof before computing it. Panics if one of the conditions
/// does not hold.
#[allow(clippy::too_many_arguments)]
fn assert_pke_proof_preconditions(
c1: &[i64],
e1: &[i64],
c2: &[i64],
e2: &[i64],
d: usize,
k_max: usize,
big_d: usize,
big_d_max: usize,
) {
assert_eq!(c1.len(), d);
assert_eq!(e1.len(), d);
assert_eq!(c2.len(), e2.len());
assert!(c2.len() <= k_max);
assert!(big_d <= big_d_max);
}
/// q (modulus) is encoded on 64 bits, with 0 meaning 2^64. This converts the encoded q to its effective
/// value for modular operations.
fn decode_q(q: u64) -> u128 {
if q == 0 {
1u128 << 64
} else {
q as u128
}
}
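// Minimal illustration (not in the original diff) of the encoding handled by
// `decode_q`: 0 stands for the full 2^64 modulus, which does not fit in a
// u64, and every other value decodes to itself.
//
//     assert_eq!(decode_q(0), 1u128 << 64);
//     assert_eq!(decode_q(12345), 12345u128);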
/// Compute r1 according to eq (11):
///
/// rot(a) * phi(bar(r)) - q phi(r1) + phi(e1) = phi(c1)
/// implies
/// phi(r1) = (rot(a) * phi(bar(r)) + phi(e1) - phi(c1)) / q
/// (phi is the function that maps a polynomial to its coeffs vector)
fn compute_r1(
e1: &[i64],
c1: &[i64],
a: &[i64],
r: &[i64],
d: usize,
decoded_q: u128,
) -> Box<[i64]> {
let mut r1 = e1
.iter()
.zip(c1.iter())
.map(|(&e1, &c1)| e1 as i128 - c1 as i128)
.collect::<Box<[_]>>();
for i in 0..d {
for j in 0..d {
if i + j < d {
r1[i + j] += a[i] as i128 * r[d - j - 1] as i128;
} else {
r1[i + j - d] -= a[i] as i128 * r[d - j - 1] as i128;
}
}
}
{
for r1 in &mut *r1 {
*r1 /= decoded_q as i128;
}
}
r1.into_vec().into_iter().map(|r1| r1 as i64).collect()
}
/// Compute r2 according to eq (11):
///
/// phi_[d - i](b).T * phi(bar(r)) + delta * m_i - q r2_i + e2_i = c2_i
/// implies
/// r2_i = (phi_[d - i](b).T * phi(bar(r)) + delta * m_i + e2_i - c2_i) / q
/// (phi is the function that maps a polynomial to its coeffs vector)
#[allow(clippy::too_many_arguments)]
fn compute_r2(
e2: &[i64],
c2: &[i64],
m: &[i64],
b: &[i64],
r: &[i64],
d: usize,
delta: u64,
decoded_q: u128,
) -> Box<[i64]> {
let mut r2 = m
.iter()
.zip(e2)
.zip(c2)
.map(|((&m, &e2), &c2)| delta as i128 * m as i128 + e2 as i128 - c2 as i128)
.collect::<Box<[_]>>();
{
for (i, r2) in r2.iter_mut().enumerate() {
let mut dot = 0i128;
for j in 0..d {
let b = if i + j < d {
b[d - j - i - 1] as i128
} else {
-(b[2 * d - j - i - 1] as i128)
};
dot += r[d - j - 1] as i128 * b;
}
*r2 += dot;
*r2 /= decoded_q as i128;
}
}
r2.into_vec().into_iter().map(|r2| r2 as i64).collect()
}
impl<G: Curve> Compressible for GroupElements<G>
where
GroupElements<G>:
@@ -246,6 +370,7 @@ mod test {
}
/// A randomly generated testcase of pke encryption
#[derive(Clone)]
pub(super) struct PkeTestcase {
pub(super) a: Vec<i64>,
pub(super) e1: Vec<i64>,
@@ -254,7 +379,7 @@ mod test {
pub(super) m: Vec<i64>,
pub(super) b: Vec<i64>,
pub(super) metadata: [u8; METADATA_LEN],
s: Vec<i64>,
pub(super) s: Vec<i64>,
}
impl PkeTestcase {
@@ -313,7 +438,7 @@ mod test {
}
}
/// Encrypt using compact pke
/// Encrypt using compact pke; the encryption is validated by performing a decryption
pub(super) fn encrypt(&self, params: PkeTestParameters) -> PkeTestCiphertext {
let PkeTestParameters {
d,
@@ -324,6 +449,47 @@ mod test {
msbs_zero_padding_bit_count: _msbs_zero_padding_bit_count,
} = params;
let ct = self.encrypt_unchecked(params);
// Check decryption
let mut m_decrypted = vec![0i64; k];
for (i, decrypted) in m_decrypted.iter_mut().enumerate() {
let mut dot = 0i128;
for j in 0..d {
let c = if i + j < d {
ct.c1[d - j - i - 1]
} else {
ct.c1[2 * d - j - i - 1].wrapping_neg()
};
dot += self.s[d - j - 1] as i128 * c as i128;
}
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
let val = ((ct.c2[i] as i128).wrapping_sub(dot)) * t as i128;
let div = val.div_euclid(q);
let rem = val.rem_euclid(q);
let result = div as i64 + (rem > (q / 2)) as i64;
let result = result.rem_euclid(params.t as i64);
*decrypted = result;
}
assert_eq!(self.m, m_decrypted);
ct
}
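// Recap of the check above (a restatement of the code, not a new claim):
// each coefficient is recovered as
//     m_i = round(t * (c2_i - <s, phi_[d - i](c1)>) / q) mod t
// using the same negacyclic indexing as encryption, so any mismatch with
// self.m signals a broken test ciphertext rather than a proof failure.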
/// Encrypt using compact pke, without checking that the decryption is correct
pub(super) fn encrypt_unchecked(&self, params: PkeTestParameters) -> PkeTestCiphertext {
let PkeTestParameters {
d,
k,
B: _B,
q,
t,
msbs_zero_padding_bit_count: _msbs_zero_padding_bit_count,
} = params;
let delta = {
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
// delta takes the encoding with the padding bit
@@ -355,35 +521,17 @@ mod test {
.wrapping_add((delta * self.m[i] as u64) as i64);
}
// Check decryption
let mut m_roundtrip = vec![0i64; k];
for i in 0..k {
let mut dot = 0i128;
for j in 0..d {
let c = if i + j < d {
c1[d - j - i - 1]
} else {
c1[2 * d - j - i - 1].wrapping_neg()
};
dot += self.s[d - j - 1] as i128 * c as i128;
}
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
let val = ((c2[i] as i128).wrapping_sub(dot)) * t as i128;
let div = val.div_euclid(q);
let rem = val.rem_euclid(q);
let result = div as i64 + (rem > (q / 2)) as i64;
let result = result.rem_euclid(params.t as i64);
m_roundtrip[i] = result;
}
assert_eq!(self.m, m_roundtrip);
PkeTestCiphertext { c1, c2 }
}
}
/// Expected result of the verification for a test
#[derive(Copy, Clone, Debug, PartialEq)]
pub(super) enum VerificationResult {
Accept,
Reject,
}
/// Return a randomly chosen point with coordinates (x, y) that is not on the curve
pub(super) fn point_not_on_curve<Config: short_weierstrass::SWCurveConfig>(
rng: &mut StdRng,


@@ -475,6 +475,24 @@ pub fn prove<G: Curve>(
metadata: &[u8],
load: ComputeLoad,
rng: &mut dyn RngCore,
) -> Proof<G> {
prove_impl(
public,
private_commit,
metadata,
load,
rng,
ProofSanityCheckMode::Panic,
)
}
fn prove_impl<G: Curve>(
public: (&PublicParams<G>, &PublicCommit<G>),
private_commit: &PrivateCommit<G>,
metadata: &[u8],
load: ComputeLoad,
rng: &mut dyn RngCore,
sanity_check_mode: ProofSanityCheckMode,
) -> Proof<G> {
let &PublicParams {
ref g_lists,
@@ -503,20 +521,23 @@ pub fn prove<G: Curve>(
let PrivateCommit { r, e1, m, e2, .. } = private_commit;
let k = c2.len();
assert!(k <= k_max);
let effective_t_for_decomposition = t >> msbs_zero_padding_bit_count;
let decoded_q = decode_q(q);
let big_d = d
+ k * effective_t_for_decomposition.ilog2() as usize
+ (d + k) * (2 + b_i.ilog2() as usize + b_r.ilog2() as usize);
assert!(big_d <= big_d_max);
if sanity_check_mode == ProofSanityCheckMode::Panic {
assert_pke_proof_preconditions(c1, e1, c2, e2, d, k_max, big_d, big_d_max);
}
// FIXME: div_round
let delta = {
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
// delta takes the encoding with the padding bit
(q / t as i128) as u64
(decoded_q / t as u128) as u64
};
let g = G::G1::GENERATOR;
@@ -524,72 +545,8 @@ pub fn prove<G: Curve>(
let gamma = G::Zp::rand(rng);
let gamma_y = G::Zp::rand(rng);
// rot(a) phi(r) + phi(e1) - q phi(r1) = phi(c1)
// phi[d - i + 1](bar(b)).T phi(r) + delta m_i + e2_i - q r2_i = c2
// phi(r1) = (rot(a) phi(r) + phi(e1) - phi(c1)) / q
// r2_i = (phi[d - i + 1](bar(b)).T phi(r) + delta m_i + e2_i - c2) / q
let mut r1 = e1
.iter()
.zip(c1.iter())
.map(|(&e1, &c1)| e1 as i128 - c1 as i128)
.collect::<Box<_>>();
for i in 0..d {
for j in 0..d {
if i + j < d {
r1[i + j] += a[i] as i128 * r[d - j - 1] as i128;
} else {
r1[i + j - d] -= a[i] as i128 * r[d - j - 1] as i128;
}
}
}
{
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
for r1 in &mut *r1 {
*r1 /= q;
}
}
let mut r2 = m
.iter()
.zip(e2)
.zip(c2)
.map(|((&m, &e2), &c2)| delta as i128 * m as i128 + e2 as i128 - c2 as i128)
.collect::<Box<_>>();
{
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
for (i, r2) in r2.iter_mut().enumerate() {
let mut dot = 0i128;
for j in 0..d {
let b = if i + j < d {
b[d - j - i - 1] as i128
} else {
-(b[2 * d - j - i - 1] as i128)
};
dot += r[d - j - 1] as i128 * b;
}
*r2 += dot;
*r2 /= q;
}
}
let r1 = r1
.into_vec()
.into_iter()
.map(|r1| r1 as i64)
.collect::<Box<_>>();
let r2 = r2
.into_vec()
.into_iter()
.map(|r2| r2 as i64)
.collect::<Box<_>>();
let r1 = compute_r1(e1, c1, a, r, d, decoded_q);
let r2 = compute_r2(e2, c2, m, b, r, d, delta, decoded_q);
let mut w = vec![false; n];
@@ -679,7 +636,7 @@ pub fn prove<G: Curve>(
delta,
b_i,
b_r,
q,
decoded_q,
);
let mut t = vec![G::Zp::ZERO; n];
@@ -898,7 +855,7 @@ fn compute_a_theta<G: Curve>(
delta: u64,
b_i: u64,
b_r: u64,
q: u64,
decoded_q: u128,
) {
// a_theta = Ã.T theta0
// = [
@@ -924,11 +881,7 @@ fn compute_a_theta<G: Curve>(
// -q g[1 + log Br].T theta2_k
// ]
let q = if q == 0 {
G::Zp::from_u128(1u128 << 64)
} else {
G::Zp::from_u64(q)
};
let q = G::Zp::from_u128(decoded_q);
let theta1 = &theta0[..d];
let theta2 = &theta0[d..];
@@ -1044,11 +997,12 @@ pub fn verify<G: Curve>(
let b_i = b;
let decoded_q = decode_q(q);
// FIXME: div_round
let delta = {
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
// delta takes the encoding with the padding bit
(q / t as i128) as u64
(decoded_q / t as u128) as u64
};
let PublicCommit { a, b, c1, c2, .. } = public.1;
@@ -1116,7 +1070,7 @@ pub fn verify<G: Curve>(
delta,
b_i,
b_r,
q,
decoded_q,
);
let mut t_theta = G::Zp::ZERO;
@@ -1318,6 +1272,7 @@ mod tests {
msbs_zero_padding_bit_count: 1,
};
/// Test that the proof is rejected if we use a different value between encryption and proof
#[test]
fn test_pke() {
let PkeTestParameters {
@@ -1442,6 +1397,209 @@ mod tests {
}
}
fn prove_and_verify<G: Curve>(
testcase: &PkeTestcase,
crs: &PublicParams<G>,
load: ComputeLoad,
rng: &mut StdRng,
) -> VerificationResult {
let ct = testcase.encrypt_unchecked(PKEV1_TEST_PARAMS);
let (public_commit, private_commit) = commit(
testcase.a.clone(),
testcase.b.clone(),
ct.c1.clone(),
ct.c2.clone(),
testcase.r.clone(),
testcase.e1.clone(),
testcase.m.clone(),
testcase.e2.clone(),
crs,
rng,
);
let proof = prove_impl(
(crs, &public_commit),
&private_commit,
&testcase.metadata,
load,
rng,
ProofSanityCheckMode::Ignore,
);
if verify(&proof, (crs, &public_commit), &testcase.metadata).is_ok() {
VerificationResult::Accept
} else {
VerificationResult::Reject
}
}
fn assert_prove_and_verify<G: Curve>(
testcase: &PkeTestcase,
testcase_name: &str,
crs: &PublicParams<G>,
rng: &mut StdRng,
expected_result: VerificationResult,
) {
for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
assert_eq!(
prove_and_verify(testcase, crs, load, rng),
expected_result,
"Testcase {testcase_name} failed"
)
}
}
/// Test that the proof is rejected if we use noise outside of the bounds
#[test]
fn test_pke_bad_noise() {
let PkeTestParameters {
d,
k,
B,
q,
t,
msbs_zero_padding_bit_count,
} = PKEV1_TEST_PARAMS;
let rng = &mut StdRng::seed_from_u64(0);
let testcase = PkeTestcase::gen(rng, PKEV1_TEST_PARAMS);
type Curve = curve_api::Bls12_446;
// A CRS where the number of slots = the number of messages to encrypt
let crs = crs_gen::<Curve>(d, k, B, q, t, msbs_zero_padding_bit_count, rng);
// A CRS where the number of slots is bigger than the number of messages to encrypt
let big_crs_k = k + 1 + (rng.gen::<usize>() % (d - k));
let crs_bigger_k =
crs_gen::<Curve>(d, big_crs_k, B, q, t, msbs_zero_padding_bit_count, rng);
// === Generate test noise vectors with random coeffs and one completely out of bounds ===
let mut testcase_bad_e1 = testcase.clone();
let bad_idx = rng.gen::<usize>() % d;
// Generate a value between B + 1 and i64::MAX to make sure that it is out of bounds
let bad_term = (rng.gen::<u64>() % (i64::MAX as u64 - (B + 1))) + (B + 1);
let bad_term = bad_term as i64;
testcase_bad_e1.e1[bad_idx] = if rng.gen() { bad_term } else { -bad_term };
let mut testcase_bad_e2 = testcase.clone();
let bad_idx = rng.gen::<usize>() % k;
testcase_bad_e2.e2[bad_idx] = if rng.gen() { bad_term } else { -bad_term };
// === Generate test noise vectors with random coeffs and one just around the bound ===
// Check slightly out of bound noise
let bad_term = (B + 1) as i64;
let mut testcase_after_bound_e1 = testcase.clone();
let bad_idx = rng.gen::<usize>() % d;
testcase_after_bound_e1.e1[bad_idx] = if rng.gen() { bad_term } else { -bad_term };
let mut testcase_after_bound_e2 = testcase.clone();
let bad_idx = rng.gen::<usize>() % k;
testcase_after_bound_e2.e2[bad_idx] = if rng.gen() { bad_term } else { -bad_term };
// Check noise right on the bound
let bad_term = B as i64;
let mut testcase_on_bound_positive_e1 = testcase.clone();
let bad_idx = rng.gen::<usize>() % d;
testcase_on_bound_positive_e1.e1[bad_idx] = bad_term;
let mut testcase_on_bound_positive_e2 = testcase.clone();
let bad_idx = rng.gen::<usize>() % k;
testcase_on_bound_positive_e2.e2[bad_idx] = bad_term;
let mut testcase_on_bound_negative_e1 = testcase.clone();
let bad_idx = rng.gen::<usize>() % d;
testcase_on_bound_negative_e1.e1[bad_idx] = -bad_term;
let mut testcase_on_bound_negative_e2 = testcase.clone();
let bad_idx = rng.gen::<usize>() % k;
testcase_on_bound_negative_e2.e2[bad_idx] = -bad_term;
// Check just before the limit
let bad_term = (B - 1) as i64;
let mut testcase_before_bound_e1 = testcase.clone();
let bad_idx = rng.gen::<usize>() % d;
testcase_before_bound_e1.e1[bad_idx] = if rng.gen() { bad_term } else { -bad_term };
let mut testcase_before_bound_e2 = testcase.clone();
let bad_idx = rng.gen::<usize>() % k;
testcase_before_bound_e2.e2[bad_idx] = if rng.gen() { bad_term } else { -bad_term };
for (testcase, name, expected_result) in [
(
testcase_bad_e1,
stringify!(testcase_bad_e1),
VerificationResult::Reject,
),
(
testcase_bad_e2,
stringify!(testcase_bad_e2),
VerificationResult::Reject,
),
(
testcase_after_bound_e1,
stringify!(testcase_after_bound_e1),
VerificationResult::Reject,
),
(
testcase_after_bound_e2,
stringify!(testcase_after_bound_e2),
VerificationResult::Reject,
),
// Upper bound is refused and lower bound is accepted
(
testcase_on_bound_positive_e1,
stringify!(testcase_on_bound_positive_e1),
VerificationResult::Reject,
),
(
testcase_on_bound_positive_e2,
stringify!(testcase_on_bound_positive_e2),
VerificationResult::Reject,
),
(
testcase_on_bound_negative_e1,
stringify!(testcase_on_bound_negative_e1),
VerificationResult::Accept,
),
(
testcase_on_bound_negative_e2,
stringify!(testcase_on_bound_negative_e2),
VerificationResult::Accept,
),
(
testcase_before_bound_e1,
stringify!(testcase_before_bound_e1),
VerificationResult::Accept,
),
(
testcase_before_bound_e2,
stringify!(testcase_before_bound_e2),
VerificationResult::Accept,
),
] {
assert_prove_and_verify(&testcase, name, &crs, rng, expected_result);
assert_prove_and_verify(&testcase, name, &crs_bigger_k, rng, expected_result);
}
}
/// Test that the proof is rejected if we don't have the padding bit set to 0
#[test]
fn test_pke_w_padding_fail_verify() {
let PkeTestParameters {
@@ -1518,6 +1676,7 @@ mod tests {
}
}
/// Test compression of proofs
#[test]
fn test_proof_compression() {
let PkeTestParameters {
@@ -1570,6 +1729,7 @@ mod tests {
}
}
/// Test the `is_usable` method, which checks the correctness of the EC points in the proof
#[test]
fn test_proof_usable() {
let PkeTestParameters {


@@ -3,6 +3,7 @@
use super::*;
use crate::backward_compatibility::pke_v2::{CompressedProofVersions, ProofVersions};
use crate::backward_compatibility::BoundVersions;
use crate::curve_api::{CompressedG1, CompressedG2};
use crate::four_squares::*;
use crate::serialization::{
@@ -35,13 +36,13 @@ pub struct PublicParams<G: Curve> {
pub n: usize,
pub d: usize,
pub k: usize,
pub B: u64,
pub B_r: u64,
pub B_bound: u64,
pub m_bound: usize,
// We store the square of the bound to avoid rounding on sqrt operations
pub B_bound_squared: u128,
pub B_inf: u64,
pub q: u64,
pub t: u64,
pub msbs_zero_padding_bit_count: u64,
pub bound_type: Bound,
pub(crate) hash: [u8; HASH_METADATA_LEN_BYTES],
pub(crate) hash_R: [u8; HASH_METADATA_LEN_BYTES],
pub(crate) hash_t: [u8; HASH_METADATA_LEN_BYTES],
@@ -72,13 +73,12 @@ where
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash,
hash_R,
hash_t,
@@ -96,13 +96,12 @@ where
n: *n,
d: *d,
k: *k,
B: *B,
B_r: *B_r,
B_bound: *B_bound,
m_bound: *m_bound,
B_inf: *B_inf,
B_bound_squared: *B_bound_squared,
q: *q,
t: *t,
msbs_zero_padding_bit_count: *msbs_zero_padding_bit_count,
bound_type: *bound_type,
hash: hash.to_vec(),
hash_R: hash_R.to_vec(),
hash_t: hash_t.to_vec(),
@@ -123,13 +122,12 @@ where
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash,
hash_R,
hash_t,
@@ -147,13 +145,12 @@ where
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash: try_vec_to_array(hash)?,
hash_R: try_vec_to_array(hash_R)?,
hash_t: try_vec_to_array(hash_t)?,
@@ -175,11 +172,11 @@ impl<G: Curve> PublicParams<G> {
g_hat_list: Vec<Affine<G::Zp, G::G2>>,
d: usize,
k: usize,
B: u64,
B_inf: u64,
q: u64,
t: u64,
msbs_zero_padding_bit_count: u64,
bound: Bound,
bound_type: Bound,
hash: [u8; HASH_METADATA_LEN_BYTES],
hash_R: [u8; HASH_METADATA_LEN_BYTES],
hash_t: [u8; HASH_METADATA_LEN_BYTES],
@@ -191,21 +188,21 @@ impl<G: Curve> PublicParams<G> {
hash_z: [u8; HASH_METADATA_LEN_BYTES],
hash_chi: [u8; HASH_METADATA_LEN_BYTES],
) -> Self {
let (n, D, B_r, B_bound, m_bound) =
compute_crs_params(d, k, B, q, t, msbs_zero_padding_bit_count, bound);
let B_squared = inf_norm_bound_to_euclidean_squared(B_inf, d + k);
let (n, D, B_bound_squared, _) =
compute_crs_params(d, k, B_squared, t, msbs_zero_padding_bit_count, bound_type);
Self {
g_lists: GroupElements::<G>::from_vec(g_list, g_hat_list),
D,
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash,
hash_R,
hash_t,
@@ -220,7 +217,9 @@ impl<G: Curve> PublicParams<G> {
}
pub fn exclusive_max_noise(&self) -> u64 {
self.B
// Here we return the bound without slack because users aren't supposed to generate noise
// inside the slack
self.B_inf + 1
}
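// Usage note (an assumption about callers, not stated in the diff): noise
// sampled by users must stay strictly below the returned value, i.e.
// |e_i| <= B_inf; the extra room added by the euclidean-norm slack is
// reserved for proof soundness, not for legitimate noise.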
/// Check if the crs can be used to generate or verify a proof
@@ -478,72 +477,90 @@ pub struct PrivateCommit<G: Curve> {
__marker: PhantomData<G>,
}
#[derive(Copy, Clone, Debug)]
#[derive(PartialEq, Copy, Clone, Debug, Serialize, Deserialize, Versionize)]
#[versionize(BoundVersions)]
pub enum Bound {
GHL,
CS,
}
fn ceil_ilog2(value: u128) -> u64 {
value.ilog2() as u64 + if value.is_power_of_two() { 0 } else { 1 }
}
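// Illustration (not in the original diff): `ceil_ilog2` rounds the base-2
// logarithm up for values that are not powers of two.
//
//     assert_eq!(ceil_ilog2(8), 3); // exact power of two
//     assert_eq!(ceil_ilog2(9), 4); // 2^3 < 9 < 2^4, rounded up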
pub fn compute_crs_params(
d: usize,
k: usize,
B: u64,
_q: u64, // we keep q here to make sure the API is consistent with [crs_gen]
B_squared: u128,
t: u64,
msbs_zero_padding_bit_count: u64,
bound: Bound,
) -> (usize, usize, u64, u64, usize) {
let B_r = d as u64 / 2 + 1;
let B_bound = {
let B = B as f64;
let d = d as f64;
let k = k as f64;
bound_type: Bound,
) -> (usize, usize, u128, usize) {
let mut B_bound_squared = {
(match bound_type {
// GHL factor is 9.75, 9.75**2 = 95.0625
// Result is multiplied and divided by 10000 to avoid floating point operations
Bound::GHL => 950625,
Bound::CS => (2 * (d + k) + 4) as u128,
}) * (B_squared + (sqr(d + 2) * (d + k)) as u128 / 4)
};
(match bound {
Bound::GHL => 9.75,
Bound::CS => f64::sqrt(2.0 * (d + k) + 4.0),
}) * f64::sqrt(sqr(B) + (sqr(d + 2.0) * (d + k)) / 4.0)
if bound_type == Bound::GHL {
B_bound_squared = B_bound_squared.div_ceil(10000);
}
.ceil() as u64;
// Formula is round_up(1 + B_bound.ilog2()) so we convert it to +2
let m_bound = 2 + B_bound.ilog2() as usize;
// Formula is round_up(1 + B_bound.ilog2()).
// Since we use B_bound_squared, the log is divided by 2
// (log2(B_bound) = log2(B_bound_squared) / 2)
let m_bound = 1 + ceil_ilog2(B_bound_squared).div_ceil(2) as usize;
// This is also the effective t for encryption
let effective_t_for_decomposition = t >> msbs_zero_padding_bit_count;
let D = d + k * effective_t_for_decomposition.ilog2() as usize;
// formula in Prove_pp: 2.
let D = d + k * (effective_t_for_decomposition.ilog2() as usize);
let n = D + 128 * m_bound;
(n, D, B_r, B_bound, m_bound)
(n, D, B_bound_squared, m_bound)
}
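// Note on the GHL branch above (illustration, not part of the diff): the
// paper's factor is 9.75, so squaring gives 9.75^2 = 95.0625; the code scales
// it by 10000 to the integer 950625, and the final div_ceil(10000) undoes the
// scaling while rounding up, keeping the whole computation in integer
// arithmetic without weakening the bound.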
/// Convert a bound on the infinite norm of a vector into a bound on the square of the euclidean
/// norm.
///
/// Use the relationship: `||x||_2 <= sqrt(dim)*||x||_inf`. Since we are only interested in the
/// squared bound, we avoid the sqrt by returning dim*(||x||_inf)^2.
fn inf_norm_bound_to_euclidean_squared(B_inf: u64, dim: usize) -> u128 {
let norm_squared = sqr(B_inf) as u128;
norm_squared * dim as u128
}
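// Worked example (illustration only): for B_inf = 3 and dim = 4 the returned
// bound is 4 * 3^2 = 36, and the vector [3, 3, 3, 3] saturates it exactly,
// which is the case where the sqrt(dim) slack of the euclidean bound is
// fully used.
//
//     assert_eq!(inf_norm_bound_to_euclidean_squared(3, 4), 36);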
/// Generates a CRS based on the bound heuristic provided by Lemma 2 of the paper.
pub fn crs_gen_ghl<G: Curve>(
d: usize,
k: usize,
B: u64,
B_inf: u64,
q: u64,
t: u64,
msbs_zero_padding_bit_count: u64,
rng: &mut dyn RngCore,
) -> PublicParams<G> {
let bound_type = Bound::GHL;
let alpha = G::Zp::rand(rng);
let B = B * (isqrt((d + k) as _) as u64 + 1);
let (n, D, B_r, B_bound, m_bound) =
compute_crs_params(d, k, B, q, t, msbs_zero_padding_bit_count, Bound::GHL);
let B_squared = inf_norm_bound_to_euclidean_squared(B_inf, d + k);
let (n, D, B_bound_squared, _) =
compute_crs_params(d, k, B_squared, t, msbs_zero_padding_bit_count, bound_type);
PublicParams {
g_lists: GroupElements::<G>::new(n, alpha),
D,
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_inf,
B_bound_squared,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash: core::array::from_fn(|_| rng.gen()),
hash_R: core::array::from_fn(|_| rng.gen()),
hash_t: core::array::from_fn(|_| rng.gen()),
@@ -562,29 +579,29 @@ pub fn crs_gen_ghl<G: Curve>(
pub fn crs_gen_cs<G: Curve>(
d: usize,
k: usize,
B: u64,
B_inf: u64,
q: u64,
t: u64,
msbs_zero_padding_bit_count: u64,
rng: &mut dyn RngCore,
) -> PublicParams<G> {
let bound_type = Bound::CS;
let alpha = G::Zp::rand(rng);
let B = B * (isqrt((d + k) as _) as u64 + 1);
let (n, D, B_r, B_bound, m_bound) =
compute_crs_params(d, k, B, q, t, msbs_zero_padding_bit_count, Bound::CS);
let B_squared = inf_norm_bound_to_euclidean_squared(B_inf, d + k);
let (n, D, B_bound_squared, _) =
compute_crs_params(d, k, B_squared, t, msbs_zero_padding_bit_count, bound_type);
PublicParams {
g_lists: GroupElements::<G>::new(n, alpha),
D,
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash: core::array::from_fn(|_| rng.gen()),
hash_R: core::array::from_fn(|_| rng.gen()),
hash_t: core::array::from_fn(|_| rng.gen()),
@@ -654,6 +671,24 @@ pub fn prove<G: Curve>(
metadata: &[u8],
load: ComputeLoad,
rng: &mut dyn RngCore,
) -> Proof<G> {
prove_impl(
public,
private_commit,
metadata,
load,
rng,
ProofSanityCheckMode::Panic,
)
}
fn prove_impl<G: Curve>(
public: (&PublicParams<G>, &PublicCommit<G>),
private_commit: &PrivateCommit<G>,
metadata: &[u8],
load: ComputeLoad,
rng: &mut dyn RngCore,
sanity_check_mode: ProofSanityCheckMode,
) -> Proof<G> {
_ = load;
let (
@@ -663,13 +698,12 @@ pub fn prove<G: Curve>(
n,
d,
k: k_max,
B,
B_r: _,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t: t_input,
msbs_zero_padding_bit_count,
bound_type,
ref hash,
ref hash_R,
ref hash_t,
@@ -689,20 +723,40 @@ pub fn prove<G: Curve>(
let PrivateCommit { r, e1, m, e2, .. } = private_commit;
let k = c2.len();
assert!(k <= k_max);
let effective_cleartext_t = t_input >> msbs_zero_padding_bit_count;
// Recompute the D for our case if k is smaller than the k max
// formula in Prove_pp: 2.
let D = d + k * effective_cleartext_t.ilog2() as usize;
assert!(D <= D_max);
let decoded_q = decode_q(q);
// Recompute some params for our case if k is smaller than the k max
let B_squared = inf_norm_bound_to_euclidean_squared(B_inf, d + k);
let (_, D, _, m_bound) = compute_crs_params(
d,
k,
B_squared,
t_input,
msbs_zero_padding_bit_count,
bound_type,
);
let e_sqr_norm = e1
.iter()
.chain(e2)
.map(|x| sqr(x.unsigned_abs() as u128))
.sum::<u128>();
if sanity_check_mode == ProofSanityCheckMode::Panic {
assert_pke_proof_preconditions(c1, e1, c2, e2, d, k_max, D, D_max);
assert!(
B_squared >= e_sqr_norm,
"squared norm of error ({e_sqr_norm}) exceeds threshold ({B_squared})",
);
}
// FIXME: div_round
let delta = {
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
// delta takes the encoding with the padding bit
(q / t_input as i128) as u64
(decoded_q / t_input as u128) as u64
};
let g = G::G1::GENERATOR;
@@ -714,75 +768,8 @@ pub fn prove<G: Curve>(
let gamma_bin = G::Zp::rand(rng);
let gamma_y = G::Zp::rand(rng);
// eq (11)
// (phi is simply the function that maps a polynomial to its coeffs vector)
// rot(a) * phi(bar(r)) - q phi(r1) + phi(e1) = phi(c1)
// phi_[d - i](b).T * phi(bar(r)) + delta * m_i - q r2_i + e2_i = c2_i
// implies
// phi(r1) = (rot(a) * phi(bar(r)) + phi(e1) - phi(c1)) / q
// r2_i = (phi_[d - i](b).T * phi(bar(r)) + delta * m_i + e2_i - c2_i) / q
let mut r1 = e1
.iter()
.zip(c1.iter())
.map(|(&e1, &c1)| e1 as i128 - c1 as i128)
.collect::<Box<[_]>>();
for i in 0..d {
for j in 0..d {
if i + j < d {
r1[i + j] += a[i] as i128 * r[d - j - 1] as i128;
} else {
r1[i + j - d] -= a[i] as i128 * r[d - j - 1] as i128;
}
}
}
{
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
for r1 in &mut *r1 {
*r1 /= q;
}
}
let mut r2 = m
.iter()
.zip(e2)
.zip(c2)
.map(|((&m, &e2), &c2)| delta as i128 * m as i128 + e2 as i128 - c2 as i128)
.collect::<Box<[_]>>();
{
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
for (i, r2) in r2.iter_mut().enumerate() {
let mut dot = 0i128;
for j in 0..d {
let b = if i + j < d {
b[d - j - i - 1] as i128
} else {
-(b[2 * d - j - i - 1] as i128)
};
dot += r[d - j - 1] as i128 * b;
}
*r2 += dot;
*r2 /= q;
}
}
let r1 = &*r1
.into_vec()
.into_iter()
.map(|r1| r1 as i64)
.collect::<Box<[_]>>();
let r2 = &*r2
.into_vec()
.into_iter()
.map(|r2| r2 as i64)
.collect::<Box<[_]>>();
let r1 = compute_r1(e1, c1, a, r, d, decoded_q);
let r2 = compute_r2(e2, c2, m, b, r, d, delta, decoded_q);
let u64 = |x: i64| x as u64;
@@ -796,19 +783,7 @@ pub fn prove<G: Curve>(
)
.collect::<Box<[_]>>();
let e_sqr_norm = e1
.iter()
.chain(e2)
.map(|x| sqr(x.unsigned_abs() as u128))
.sum::<u128>();
assert!(
sqr(B as u128) >= e_sqr_norm,
"squared norm of error ({e_sqr_norm}) exceeds threshold ({})",
sqr(B as u128)
);
let v = four_squares(sqr(B as u128) - e_sqr_norm).map(|v| v as i64);
let v = four_squares(B_squared - e_sqr_norm).map(|v| v as i64);
let e1_zp = &*e1
.iter()
@@ -860,7 +835,7 @@ pub fn prove<G: Curve>(
let x_bytes = &*[
q.to_le_bytes().as_slice(),
(d as u64).to_le_bytes().as_slice(),
B.to_le_bytes().as_slice(),
B_squared.to_le_bytes().as_slice(),
t_input.to_le_bytes().as_slice(),
msbs_zero_padding_bit_count.to_le_bytes().as_slice(),
&*a.iter()
@@ -928,8 +903,8 @@ pub fn prove<G: Curve>(
e1.iter()
.chain(e2)
.chain(&v)
.chain(r1)
.chain(r2)
.chain(&r1)
.chain(&r2)
.copied()
.enumerate()
.for_each(|(j, x)| match R(j) {
@@ -938,7 +913,9 @@ pub fn prove<G: Curve>(
-1 => acc -= x as i128,
_ => unreachable!(),
});
assert!(acc.unsigned_abs() <= B_bound as u128);
if sanity_check_mode == ProofSanityCheckMode::Panic {
assert!(sqr(acc) as u128 <= B_bound_squared);
}
acc as i64
})
.collect::<Box<[_]>>();
@@ -1036,7 +1013,9 @@ pub fn prove<G: Curve>(
.flat_map(|x| x.to_le_bytes().as_ref().to_vec())
.collect::<Box<[_]>>();
assert_eq!(y.len(), w_bin.len());
if sanity_check_mode == ProofSanityCheckMode::Panic {
assert_eq!(y.len(), w_bin.len());
}
let scalars = y
.iter()
.zip(w_bin.iter())
@@ -1272,8 +1251,7 @@ pub fn prove<G: Curve>(
*p = r2_zp[j];
}
let delta_theta_q =
delta_theta * G::Zp::from_u128(if q == 0 { 1u128 << 64 } else { q as u128 });
let delta_theta_q = delta_theta * G::Zp::from_u128(decoded_q);
for j in 0..d + k {
let p = &mut poly_2_rhs[n - j];
@@ -1401,7 +1379,7 @@ pub fn prove<G: Curve>(
}
let mut P_pi = poly_0;
if P_pi.len() > n + 1 {
P_pi[n + 1] -= delta_theta * t_theta + delta_l * sqr(G::Zp::from_u64(B));
P_pi[n + 1] -= delta_theta * t_theta + delta_l * G::Zp::from_u128(B_squared);
}
let pi = if P_pi.is_empty() {
@@ -1851,13 +1829,12 @@ pub fn verify<G: Curve>(
n,
d,
k: k_max,
B,
B_r: _,
B_bound: _,
m_bound: m,
B_bound_squared: _,
B_inf,
q,
t: t_input,
msbs_zero_padding_bit_count,
bound_type,
ref hash,
ref hash_R,
ref hash_t,
@@ -1872,11 +1849,12 @@ pub fn verify<G: Curve>(
let g_list = &*g_lists.g_list.0;
let g_hat_list = &*g_lists.g_hat_list.0;
let decoded_q = decode_q(q);
// FIXME: div_round
let delta = {
let q = if q == 0 { 1i128 << 64 } else { q as i128 };
// delta takes the encoding with the padding bit
(q / t_input as i128) as u64
(decoded_q / t_input as u128) as u64
};
let PublicCommit { a, b, c1, c2, .. } = public.1;
@@ -1886,10 +1864,18 @@ pub fn verify<G: Curve>(
}
let effective_cleartext_t = t_input >> msbs_zero_padding_bit_count;
let B_squared = inf_norm_bound_to_euclidean_squared(B_inf, d + k);
let (_, D, _, m_bound) = compute_crs_params(
d,
k,
B_squared,
t_input,
msbs_zero_padding_bit_count,
bound_type,
);
let m = m_bound;
// Recompute the D for our case if k is smaller than the k max
// formula in Prove_pp: 2.
let D = d + k * effective_cleartext_t.ilog2() as usize;
if D > D_max {
return Err(());
}
@@ -1912,7 +1898,7 @@ pub fn verify<G: Curve>(
let x_bytes = &*[
q.to_le_bytes().as_slice(),
(d as u64).to_le_bytes().as_slice(),
B.to_le_bytes().as_slice(),
B_squared.to_le_bytes().as_slice(),
t_input.to_le_bytes().as_slice(),
msbs_zero_padding_bit_count.to_le_bytes().as_slice(),
&*a.iter()
@@ -2150,8 +2136,7 @@ pub fn verify<G: Curve>(
let g = G::G1::GENERATOR;
let g_hat = G::G2::GENERATOR;
let delta_theta_q =
delta_theta * G::Zp::from_u128(if q == 0 { 1u128 << 64 } else { q as u128 });
let delta_theta_q = delta_theta * G::Zp::from_u128(decoded_q);
let rhs = pairing(pi, g_hat);
let lhs = {
@@ -2210,7 +2195,7 @@ pub fn verify<G: Curve>(
G::G1::projective(g_list[0]),
G::G2::projective(g_hat_list[n - 1]),
)
.mul_scalar(delta_theta * t_theta + delta_l * sqr(G::Zp::from_u64(B)));
.mul_scalar(delta_theta * t_theta + delta_l * G::Zp::from_u128(B_squared));
lhs0 + lhs1 + lhs2 - lhs3 - lhs4 - lhs5 - lhs6
};
@@ -2437,6 +2422,7 @@ mod tests {
msbs_zero_padding_bit_count: 1,
};
/// Test that the proof is rejected if we use a different value between encryption and proof
#[test]
fn test_pke() {
let PkeTestParameters {
@@ -2560,6 +2546,356 @@ mod tests {
}
}
fn prove_and_verify<G: Curve>(
testcase: &PkeTestcase,
crs: &PublicParams<G>,
load: ComputeLoad,
rng: &mut StdRng,
) -> VerificationResult {
let ct = testcase.encrypt_unchecked(PKEV2_TEST_PARAMS);
let (public_commit, private_commit) = commit(
testcase.a.clone(),
testcase.b.clone(),
ct.c1.clone(),
ct.c2.clone(),
testcase.r.clone(),
testcase.e1.clone(),
testcase.m.clone(),
testcase.e2.clone(),
crs,
rng,
);
let proof = prove_impl(
(crs, &public_commit),
&private_commit,
&testcase.metadata,
load,
rng,
ProofSanityCheckMode::Ignore,
);
if verify(&proof, (crs, &public_commit), &testcase.metadata).is_ok() {
VerificationResult::Accept
} else {
VerificationResult::Reject
}
}
fn assert_prove_and_verify<G: Curve>(
testcase: &PkeTestcase,
testcase_name: &str,
crs: &PublicParams<G>,
rng: &mut StdRng,
expected_result: VerificationResult,
) {
for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
assert_eq!(
prove_and_verify(testcase, crs, load, rng),
expected_result,
"Testcase {testcase_name} failed"
)
}
}
#[derive(Clone, Copy)]
enum BoundTestSlackMode {
/// Generate test noise vectors with all coeffs at 0 except one
// Here ||e||_inf == ||e||_2 so the slack is at its largest, since B is
// multiplied by sqrt(d+k) anyway
Max,
/// Generate test noise vectors with random coeffs and one just around the bound
// Here the slack should be "average"
Avg,
/// Generate test noise vectors with all coeffs equal to +/-B except the tested one
// Here the slack should be minimal since ||e||_2 = sqrt(d+k)*||e||_inf, which is exactly
// what we are proving.
Min,
}
impl Display for BoundTestSlackMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
BoundTestSlackMode::Min => write!(f, "min_slack"),
BoundTestSlackMode::Avg => write!(f, "avg_slack"),
BoundTestSlackMode::Max => write!(f, "max_slack"),
}
}
}
#[derive(Clone, Copy)]
enum TestedCoeffOffsetType {
/// Noise term is after the bound, the proof should be refused
After,
/// Noise term is right on the bound, the proof should be accepted
On,
/// Noise term is before the bound, the proof should be accepted
Before,
}
impl Display for TestedCoeffOffsetType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
TestedCoeffOffsetType::After => write!(f, "after_bound"),
TestedCoeffOffsetType::On => write!(f, "on_bound"),
TestedCoeffOffsetType::Before => write!(f, "before_bound"),
}
}
}
impl TestedCoeffOffsetType {
fn offset(self) -> i64 {
match self {
TestedCoeffOffsetType::After => 1,
TestedCoeffOffsetType::On => 0,
TestedCoeffOffsetType::Before => -1,
}
}
fn expected_result(self) -> VerificationResult {
match self {
TestedCoeffOffsetType::After => VerificationResult::Reject,
TestedCoeffOffsetType::On => VerificationResult::Accept,
TestedCoeffOffsetType::Before => VerificationResult::Accept,
}
}
}
#[derive(Clone, Copy)]
enum TestedCoeffType {
E1,
E2,
}
impl Display for TestedCoeffType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
TestedCoeffType::E1 => write!(f, "e1"),
TestedCoeffType::E2 => write!(f, "e2"),
}
}
}
struct PkeBoundTestcase {
name: String,
testcase: PkeTestcase,
expected_result: VerificationResult,
}
impl PkeBoundTestcase {
fn new(
ref_testcase: &PkeTestcase,
B: u64,
slack_mode: BoundTestSlackMode,
offset_type: TestedCoeffOffsetType,
coeff_type: TestedCoeffType,
rng: &mut StdRng,
) -> Self {
let mut testcase = ref_testcase.clone();
let d = testcase.e1.len();
let k = testcase.e2.len();
// Select a random index for the tested term
let tested_idx = match coeff_type {
TestedCoeffType::E1 => rng.gen::<usize>() % d,
TestedCoeffType::E2 => rng.gen::<usize>() % k,
};
// Initialize the "good" terms of the error, i.e. those that are not above the bound
match slack_mode {
BoundTestSlackMode::Max => {
// In this mode, all the terms are 0 except the tested one
testcase.e1 = vec![0; d];
testcase.e2 = vec![0; k];
}
BoundTestSlackMode::Avg => {
// In this mode we keep the original random vector
}
BoundTestSlackMode::Min => {
// In this mode all the terms are exactly at the bound
let good_term = B as i64;
testcase.e1 = (0..d)
.map(|_| if rng.gen() { good_term } else { -good_term })
.collect();
testcase.e2 = (0..k)
.map(|_| if rng.gen() { good_term } else { -good_term })
.collect();
}
};
let B_with_slack_squared = inf_norm_bound_to_euclidean_squared(B, d + k);
let B_with_slack = isqrt(B_with_slack_squared) as u64;
let bound = match slack_mode {
// The slack is maximal: any term above B + slack should be refused
BoundTestSlackMode::Max => B_with_slack as i64,
// The actual accepted bound depends on the content of the test vector
BoundTestSlackMode::Avg => {
let e_sqr_norm = testcase
.e1
.iter()
.chain(&testcase.e2)
.map(|x| sqr(x.unsigned_abs() as u128))
.sum::<u128>();
let orig_value = match coeff_type {
TestedCoeffType::E1 => testcase.e1[tested_idx],
TestedCoeffType::E2 => testcase.e2[tested_idx],
};
let bound_squared =
B_with_slack_squared - (e_sqr_norm - sqr(orig_value as u128));
isqrt(bound_squared) as i64
}
// There is no slack effect: any term above B should be refused
BoundTestSlackMode::Min => B as i64,
};
let tested_term = bound + offset_type.offset();
match coeff_type {
TestedCoeffType::E1 => testcase.e1[tested_idx] = tested_term,
TestedCoeffType::E2 => testcase.e2[tested_idx] = tested_term,
};
Self {
name: format!("test_{slack_mode}_{offset_type}_{coeff_type}"),
testcase,
expected_result: offset_type.expected_result(),
}
}
}
/// Test that the proof is rejected if we use noise outside of the bounds, taking the slack
/// into account
#[test]
fn test_pke_bad_noise() {
let PkeTestParameters {
d,
k,
B,
q,
t,
msbs_zero_padding_bit_count,
} = PKEV2_TEST_PARAMS;
let rng = &mut StdRng::seed_from_u64(0);
let testcase = PkeTestcase::gen(rng, PKEV2_TEST_PARAMS);
type Curve = curve_api::Bls12_446;
let crs = crs_gen::<Curve>(d, k, B, q, t, msbs_zero_padding_bit_count, rng);
let crs_max_k = crs_gen::<Curve>(d, d, B, q, t, msbs_zero_padding_bit_count, rng);
let B_with_slack_squared = inf_norm_bound_to_euclidean_squared(B, d + k);
let B_with_slack_upper = isqrt(B_with_slack_squared) as u64 + 1;
// Generate test noise vectors with random coeffs and one completely out of bounds
let mut testcases = Vec::new();
let mut testcase_bad_e1 = testcase.clone();
let bad_idx = rng.gen::<usize>() % d;
let bad_term =
(rng.gen::<u64>() % (i64::MAX as u64 - B_with_slack_upper)) + B_with_slack_upper;
let bad_term = bad_term as i64;
testcase_bad_e1.e1[bad_idx] = if rng.gen() { bad_term } else { -bad_term };
testcases.push(PkeBoundTestcase {
name: "testcase_bad_e1".to_string(),
testcase: testcase_bad_e1,
expected_result: VerificationResult::Reject,
});
let mut testcase_bad_e2 = testcase.clone();
let bad_idx = rng.gen::<usize>() % k;
testcase_bad_e2.e2[bad_idx] = if rng.gen() { bad_term } else { -bad_term };
testcases.push(PkeBoundTestcase {
name: "testcase_bad_e2".to_string(),
testcase: testcase_bad_e2,
expected_result: VerificationResult::Reject,
});
// Generate test vectors with a noise term right around the bound
testcases.extend(
itertools::iproduct!(
[
BoundTestSlackMode::Min,
BoundTestSlackMode::Avg,
BoundTestSlackMode::Max
],
[
TestedCoeffOffsetType::Before,
TestedCoeffOffsetType::On,
TestedCoeffOffsetType::After
],
[TestedCoeffType::E1, TestedCoeffType::E2]
)
.map(|(slack_mode, offset_type, coeff_type)| {
PkeBoundTestcase::new(&testcase, B, slack_mode, offset_type, coeff_type, rng)
}),
);
for PkeBoundTestcase {
name,
testcase,
expected_result,
} in testcases
{
assert_prove_and_verify(
&testcase,
&format!("{name}_crs"),
&crs,
rng,
expected_result,
);
assert_prove_and_verify(
&testcase,
&format!("{name}_crs_max_k"),
&crs_max_k,
rng,
expected_result,
);
}
}
/// Compare the computed params with manually calculated ones to check the formula
#[test]
fn test_compute_crs_params() {
let PkeTestParameters {
d,
k,
B,
q: _,
t,
msbs_zero_padding_bit_count,
} = PKEV2_TEST_PARAMS;
let B_squared = inf_norm_bound_to_euclidean_squared(B, d + k);
assert_eq!(B_squared, 40681930227712);
let (n, D, B_bound_squared, m_bound) =
compute_crs_params(d, k, B_squared, t, msbs_zero_padding_bit_count, Bound::GHL);
assert_eq!(n, 6784);
assert_eq!(D, 3328);
assert_eq!(B_bound_squared, 3867562496364372);
assert_eq!(m_bound, 27);
let (n, D, B_bound_squared, m_bound) =
compute_crs_params(d, k, B_squared, t, msbs_zero_padding_bit_count, Bound::CS);
assert_eq!(n, 7168);
assert_eq!(D, 3328);
assert_eq!(B_bound_squared, 192844141830554880);
assert_eq!(m_bound, 30);
}
/// Test that the proof is rejected if we don't have the padding bit set to 0
#[test]
fn test_pke_w_padding_fail_verify() {
let PkeTestParameters {
@@ -2636,6 +2972,7 @@ mod tests {
}
}
/// Test compression of proofs
#[test]
fn test_proof_compression() {
let PkeTestParameters {
@@ -2688,6 +3025,7 @@ mod tests {
}
}
/// Test the `is_usable` method, which checks the correctness of the EC points in the proof
#[test]
fn test_proof_usable() {
let PkeTestParameters {


@@ -17,7 +17,7 @@ use tfhe_versionable::Versionize;
use crate::curve_api::{Curve, CurveGroupOps};
use crate::proofs::pke::PublicParams as PKEv1PublicParams;
use crate::proofs::pke_v2::PublicParams as PKEv2PublicParams;
use crate::proofs::pke_v2::{Bound, PublicParams as PKEv2PublicParams};
use crate::proofs::GroupElements;
/// Error returned when a conversion from a vec to a fixed size array failed because the vec size is
@@ -397,13 +397,12 @@ pub struct SerializablePKEv2PublicParams {
pub n: usize,
pub d: usize,
pub k: usize,
pub B: u64,
pub B_r: u64,
pub B_bound: u64,
pub m_bound: usize,
pub B_bound_squared: u128,
pub B_inf: u64,
pub q: u64,
pub t: u64,
pub msbs_zero_padding_bit_count: u64,
pub bound_type: Bound,
// We use Vec<u8> since serde does not support fixed size arrays of 256 elements
pub(crate) hash: Vec<u8>,
pub(crate) hash_R: Vec<u8>,
@@ -428,13 +427,12 @@ where
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash,
hash_R,
hash_t,
@@ -452,13 +450,12 @@ where
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash: hash.to_vec(),
hash_R: hash_R.to_vec(),
hash_t: hash_t.to_vec(),
@@ -487,13 +484,12 @@ where
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash,
hash_R,
hash_t,
@@ -511,13 +507,12 @@ where
n,
d,
k,
B,
B_r,
B_bound,
m_bound,
B_bound_squared,
B_inf,
q,
t,
msbs_zero_padding_bit_count,
bound_type,
hash: try_vec_to_array(hash)?,
hash_R: try_vec_to_array(hash_R)?,
hash_t: try_vec_to_array(hash_t)?,


@@ -141,6 +141,7 @@ generator_aarch64_aes = ["tfhe-csprng/generator_aarch64_aes"]
# Private features
__profiling = []
__long_run_tests = []
seeder_unix = ["tfhe-csprng/seeder_unix"]
seeder_x86_64_rdseed = ["tfhe-csprng/seeder_x86_64_rdseed"]


@@ -111,7 +111,7 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
level: DecompositionLevelCount,
num_samples: u32,
) {
let lut_count = 1u32;
let num_many_lut = 1u32;
let lut_stride = 0u32;
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
scratch_cuda_programmable_bootstrap_64(
@@ -141,7 +141,7 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
base_log.0 as u32,
level.0 as u32,
num_samples,
lut_count,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
@@ -175,7 +175,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
grouping_factor: LweBskGroupingFactor,
num_samples: u32,
) {
let lut_count = 1u32;
let num_many_lut = 1u32;
let lut_stride = 0u32;
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
scratch_cuda_multi_bit_programmable_bootstrap_64(
@@ -206,7 +206,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
base_log.0 as u32,
level.0 as u32,
num_samples,
lut_count,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(


@@ -310,21 +310,6 @@ impl CudaRadixCiphertextInfo {
.collect(),
}
}
pub(crate) fn after_bitnot(&self) -> Self {
Self {
blocks: self
.blocks
.iter()
.map(|left| CudaBlockInfo {
degree: Degree::new(left.message_modulus.0 - 1),
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: NoiseLevel::NOMINAL,
})
.collect(),
}
}
pub(crate) fn after_scalar_bitand<T>(&self, scalar: T) -> Self
where
T: DecomposableInto<u8>,


@@ -15,6 +15,7 @@ use crate::shortint::{CarryModulus, MessageModulus};
pub use server_key::CudaServerKey;
use std::cmp::min;
use crate::integer::server_key::radix_parallel::OutputFlag;
use tfhe_cuda_backend::bindings::*;
use tfhe_cuda_backend::cuda_bind::*;
@@ -1016,10 +1017,11 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B: Numeric>(
pub(crate) unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_input: &mut CudaVec<T>,
carry_out: &mut CudaVec<T>,
carry_in: &CudaVec<T>,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
lwe_dimension: LweDimension,
@@ -1034,6 +1036,8 @@ pub unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B: Numeric
carry_modulus: CarryModulus,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
requested_flag: OutputFlag,
uses_carry: u32,
) {
assert_eq!(
streams.gpu_indexes[0],
@@ -1070,6 +1074,8 @@ pub unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B: Numeric
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
requested_flag as u32,
uses_carry,
true,
);
cuda_propagate_single_carry_kb_64_inplace(
@@ -1078,10 +1084,13 @@ pub unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B: Numeric
streams.len() as u32,
radix_lwe_input.as_mut_c_ptr(0),
carry_out.as_mut_c_ptr(0),
carry_in.as_c_ptr(0),
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
num_blocks,
requested_flag as u32,
uses_carry,
);
cleanup_cuda_propagate_single_carry(
streams.ptr.as_ptr(),
@@ -1096,14 +1105,12 @@ pub unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B: Numeric
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn propagate_single_carry_get_input_carries_assign_async<
T: UnsignedInteger,
B: Numeric,
>(
pub(crate) unsafe fn add_and_propagate_single_carry_assign_async<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_input: &mut CudaVec<T>,
radix_lwe_lhs_input: &mut CudaVec<T>,
radix_lwe_rhs_input: &CudaVec<T>,
carry_out: &mut CudaVec<T>,
input_carries: &mut CudaVec<T>,
carry_in: &CudaVec<T>,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
lwe_dimension: LweDimension,
@@ -1118,10 +1125,17 @@ pub unsafe fn propagate_single_carry_get_input_carries_assign_async<
carry_modulus: CarryModulus,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
requested_flag: OutputFlag,
uses_carry: u32,
) {
assert_eq!(
streams.gpu_indexes[0],
radix_lwe_input.gpu_index(0),
radix_lwe_lhs_input.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
radix_lwe_rhs_input.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
@@ -1136,7 +1150,7 @@ pub unsafe fn propagate_single_carry_get_input_carries_assign_async<
);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32;
scratch_cuda_propagate_single_carry_kb_64_inplace(
scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
@@ -1154,21 +1168,26 @@ pub unsafe fn propagate_single_carry_get_input_carries_assign_async<
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
requested_flag as u32,
uses_carry,
true,
);
cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
cuda_add_and_propagate_single_carry_kb_64_inplace(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
radix_lwe_input.as_mut_c_ptr(0),
radix_lwe_lhs_input.as_mut_c_ptr(0),
radix_lwe_rhs_input.as_c_ptr(0),
carry_out.as_mut_c_ptr(0),
input_carries.as_mut_c_ptr(0),
carry_in.as_c_ptr(0),
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
num_blocks,
requested_flag as u32,
uses_carry,
);
cleanup_cuda_propagate_single_carry(
cleanup_cuda_add_and_propagate_single_carry(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
@@ -2145,108 +2164,6 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async<
T: UnsignedInteger,
B: Numeric,
>(
streams: &CudaStreams,
ct_res: &mut CudaVec<T>,
ct_overflowed: &mut CudaVec<T>,
lhs: &CudaVec<T>,
rhs: &CudaVec<T>,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
big_lwe_dimension: LweDimension,
small_lwe_dimension: LweDimension,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
num_blocks: u32,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
) {
assert_eq!(
streams.gpu_indexes[0],
ct_res.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
ct_overflowed.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
lhs.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
rhs.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
bootstrapping_key.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
keyswitch_key.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
scratch_cuda_integer_radix_overflowing_sub_kb_64(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
big_lwe_dimension.0 as u32,
small_lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
num_blocks,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
true,
);
cuda_integer_radix_overflowing_sub_kb_64(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
ct_res.as_mut_c_ptr(0),
ct_overflowed.as_mut_c_ptr(0),
lhs.as_c_ptr(0),
rhs.as_c_ptr(0),
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
num_blocks,
);
cleanup_cuda_integer_radix_overflowing_sub(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
std::ptr::addr_of_mut!(mem_ptr),
);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
@@ -2356,7 +2273,7 @@ pub unsafe fn apply_many_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>
carry_modulus: CarryModulus,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
lut_count: u32,
num_many_lut: u32,
lut_stride: u32,
) {
assert_eq!(
@@ -2410,7 +2327,7 @@ pub unsafe fn apply_many_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>
keyswitch_key.ptr.as_ptr(),
bootstrapping_key.ptr.as_ptr(),
num_blocks,
lut_count,
num_many_lut,
lut_stride,
);
cleanup_cuda_apply_univariate_lut_kb_64(
@@ -2587,79 +2504,6 @@ pub unsafe fn unchecked_div_rem_integer_radix_kb_assign_async<T: UnsignedInteger
);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async<
T: UnsignedInteger,
B: Numeric,
>(
streams: &CudaStreams,
lhs: &mut CudaVec<T>,
rhs: &CudaVec<T>,
overflowed: &mut CudaVec<T>,
signed_operation: i8,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
big_lwe_dimension: LweDimension,
small_lwe_dimension: LweDimension,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
num_blocks: u32,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
) {
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
big_lwe_dimension.0 as u32,
small_lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
num_blocks,
signed_operation,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
true,
);
cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
lhs.as_mut_c_ptr(0),
rhs.as_c_ptr(0),
overflowed.as_mut_c_ptr(0),
signed_operation,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
num_blocks,
);
cleanup_signed_overflowing_add_or_sub(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
std::ptr::addr_of_mut!(mem_ptr),
);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
@@ -2777,6 +2621,98 @@ pub unsafe fn reverse_blocks_inplace_async<T: UnsignedInteger>(
}
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub(crate) unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async<
T: UnsignedInteger,
B: Numeric,
>(
streams: &CudaStreams,
radix_lwe_input: &mut CudaVec<T>,
radix_rhs_input: &CudaVec<T>,
carry_out: &mut CudaVec<T>,
carry_in: &CudaVec<T>,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
lwe_dimension: LweDimension,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
num_blocks: u32,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
compute_overflow: bool,
uses_input_borrow: u32,
) {
assert_eq!(
streams.gpu_indexes[0],
radix_lwe_input.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
bootstrapping_key.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
keyswitch_key.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32;
scratch_cuda_integer_overflowing_sub_kb_64_inplace(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
big_lwe_dimension,
lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
num_blocks,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
compute_overflow as u32,
true,
);
cuda_integer_overflowing_sub_kb_64_inplace(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
radix_lwe_input.as_mut_c_ptr(0),
radix_rhs_input.as_c_ptr(0),
carry_out.as_mut_c_ptr(0),
carry_in.as_c_ptr(0),
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
num_blocks,
compute_overflow as u32,
uses_input_borrow,
);
cleanup_cuda_integer_overflowing_sub(
streams.ptr.as_ptr(),
streams.gpu_indexes.as_ptr(),
streams.len() as u32,
std::ptr::addr_of_mut!(mem_ptr),
);
}
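Editor's note: the new wrapper above follows the backend's three-phase pattern: a scratch_* call sizes and allocates temporary device memory behind mem_ptr, the cuda_* entry point runs the kernel against that scratch, and a cleanup_* call releases it. A minimal sketch of that lifecycle with hypothetical stand-in functions (not the real FFI symbols):

// Hypothetical stand-ins for the scratch_* / cuda_* / cleanup_* FFI triple.
unsafe fn scratch(_mem_ptr: *mut *mut i8) { /* sizes and allocates device scratch */ }
unsafe fn compute(_mem: *mut i8) { /* launches the kernel using the scratch */ }
unsafe fn cleanup(_mem_ptr: *mut *mut i8) { /* releases the scratch buffer */ }

unsafe fn call_pattern() {
    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
    scratch(std::ptr::addr_of_mut!(mem_ptr)); // phase 1: allocate
    compute(mem_ptr); // phase 2: run
    cleanup(std::ptr::addr_of_mut!(mem_ptr)); // phase 3: free
}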
#[allow(clippy::too_many_arguments)]
/// # Safety
///

View File

@@ -8,17 +8,11 @@ use crate::integer::gpu::ciphertext::{
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
unchecked_add_integer_radix_assign_async,
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async,
unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async, PBSType,
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::NoiseLevel;
#[derive(Copy, Clone, PartialEq, Eq)]
pub(crate) enum SignedOperation {
Addition,
Subtraction,
}
impl CudaServerKey {
/// Homomorphically computes an addition between two ciphertexts encrypting integer values.
///
@@ -114,8 +108,14 @@ impl CudaServerKey {
(ct_left, &tmp_rhs)
}
};
self.unchecked_add_assign_async(lhs, rhs, streams);
let _carry = self.propagate_single_carry_assign_async(lhs, streams);
let _carry = self.add_and_propagate_single_carry_assign_async(
lhs,
rhs,
streams,
None,
OutputFlag::None,
);
}
pub fn add_assign<T: CudaIntegerRadixCiphertext>(
@@ -348,7 +348,7 @@ impl CudaServerKey {
.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams)
.unwrap();
self.propagate_single_carry_assign_async(&mut result, streams);
self.propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None);
assert!(result.block_carries_are_empty());
result
}
@@ -535,8 +535,58 @@ impl CudaServerKey {
rhs: &CudaUnsignedRadixCiphertext,
stream: &CudaStreams,
) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) {
let mut ct_res = self.unchecked_add(lhs, rhs, stream);
let mut carry_out = self.propagate_single_carry_assign_async(&mut ct_res, stream);
let output_flag = OutputFlag::from_signedness(CudaUnsignedRadixCiphertext::IS_SIGNED);
let mut ct_res = lhs.duplicate_async(stream);
let mut carry_out: CudaUnsignedRadixCiphertext = self
.add_and_propagate_single_carry_assign_async(
&mut ct_res,
rhs,
stream,
None,
output_flag,
);
ct_res.as_mut().info = ct_res
.as_ref()
.info
.after_overflowing_add(&rhs.as_ref().info);
if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
&& rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
{
carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::ZERO);
} else {
carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::NOMINAL);
}
let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext);
(ct_res, ct_overflowed)
}
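Editor's note: with OutputFlag::from_signedness on an unsigned type, the boolean returned above is the carry out of the most significant block. A plain-integer analogue of that semantics (standard-library Rust, not GPU code):

fn main() {
    // u8 stands in for an unsigned radix ciphertext: the overflow flag of an
    // unsigned addition is exactly the carry out of the top "block".
    let (sum, overflowed) = 250u8.overflowing_add(7);
    assert_eq!((sum, overflowed), (1, true));
}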
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronized
pub unsafe fn unchecked_signed_overflowing_add_async(
&self,
lhs: &CudaSignedRadixCiphertext,
rhs: &CudaSignedRadixCiphertext,
input_carry: Option<&CudaBooleanBlock>,
stream: &CudaStreams,
) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) {
let output_flag = OutputFlag::from_signedness(CudaSignedRadixCiphertext::IS_SIGNED);
let mut ct_res = lhs.duplicate_async(stream);
let mut carry_out: CudaSignedRadixCiphertext = self
.add_and_propagate_single_carry_assign_async(
&mut ct_res,
rhs,
stream,
input_carry,
output_flag,
);
ct_res.as_mut().info = ct_res
.as_ref()
@@ -655,141 +705,13 @@ impl CudaServerKey {
"inputs cannot be empty"
);
self.unchecked_signed_overflowing_add_or_sub(
ct_left,
ct_right,
SignedOperation::Addition,
stream,
)
}
pub(crate) fn unchecked_signed_overflowing_add_or_sub(
&self,
lhs: &CudaSignedRadixCiphertext,
rhs: &CudaSignedRadixCiphertext,
signed_operation: SignedOperation,
streams: &CudaStreams,
) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) {
assert!(self.message_modulus.0 >= 4 && self.carry_modulus.0 >= 4);
let mut result: CudaSignedRadixCiphertext;
let result;
let overflowed;
unsafe {
result = lhs.duplicate_async(streams);
}
let carry_out: CudaSignedRadixCiphertext =
unsafe { self.create_trivial_zero_radix_async(1, streams) };
let mut overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext);
unsafe {
self.unchecked_signed_overflowing_add_or_sub_assign_async(
&mut result,
rhs,
&mut overflowed,
signed_operation,
streams,
);
}
streams.synchronize();
(result, overflowed) =
self.unchecked_signed_overflowing_add_async(ct_left, ct_right, None, stream);
};
stream.synchronize();
(result, overflowed)
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronized
pub(crate) unsafe fn unchecked_signed_overflowing_add_or_sub_assign_async(
&self,
lhs: &mut CudaSignedRadixCiphertext,
rhs: &CudaSignedRadixCiphertext,
overflowed: &mut CudaBooleanBlock,
signed_operation: SignedOperation,
streams: &CudaStreams,
) {
if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
&& rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
{
overflowed.as_mut().ciphertext.info = overflowed
.as_ref()
.ciphertext
.info
.boolean_info(NoiseLevel::ZERO);
} else {
overflowed.as_mut().ciphertext.info = overflowed
.as_ref()
.ciphertext
.info
.boolean_info(NoiseLevel::NOMINAL);
}
let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
let signed_operation_numeric: i8 =
if matches!(signed_operation, SignedOperation::Subtraction) {
-1
} else {
1
};
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async(
streams,
&mut lhs.as_mut().d_blocks.0.d_vec,
&rhs.as_ref().d_blocks.0.d_vec,
&mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec,
signed_operation_numeric,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
num_blocks,
PBSType::Classical,
LweBskGroupingFactor(0),
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async(
streams,
&mut lhs.as_mut().d_blocks.0.d_vec,
&rhs.as_ref().d_blocks.0.d_vec,
&mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec,
signed_operation_numeric,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
num_blocks,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
);
}
};
lhs.as_mut().info = lhs
.as_ref()
.info
.after_overflowing_add(&rhs.ciphertext.info);
}
}

View File

@@ -90,7 +90,6 @@ impl CudaServerKey {
&d_decomposed_scalar,
streams,
);
ct.as_mut().info = ct.as_ref().info.after_bitnot();
}
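Editor's note: block-wise, a radix bitnot maps each block value b to (msg_modulus - 1) - b, which for a power-of-two message modulus equals XOR with the block mask, so the degree bound is unchanged. A quick plain-integer check of that identity (the msg_modulus value is illustrative):

fn main() {
    // 2-bit message blocks: (msg_modulus - 1) - b == b ^ (msg_modulus - 1)
    // for every block value b below the modulus.
    let msg_modulus: u64 = 4;
    for b in 0..msg_modulus {
        assert_eq!((msg_modulus - 1) - b, b ^ (msg_modulus - 1));
    }
}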
pub fn unchecked_bitnot_assign<T: CudaIntegerRadixCiphertext>(

View File

@@ -6,6 +6,7 @@ use crate::core_crypto::prelude::{
ContiguousEntityContainerMut, LweBskGroupingFactor, LweCiphertextCount,
};
use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
use crate::integer::gpu::ciphertext::{
CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
@@ -13,10 +14,11 @@ use crate::integer::gpu::ciphertext::{
};
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
apply_many_univariate_lut_kb_async, apply_univariate_lut_kb_async, full_propagate_assign_async,
propagate_single_carry_assign_async, propagate_single_carry_get_input_carries_assign_async,
CudaServerKey, PBSType,
add_and_propagate_single_carry_assign_async, apply_many_univariate_lut_kb_async,
apply_univariate_lut_kb_async, full_propagate_assign_async,
propagate_single_carry_assign_async, CudaServerKey, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::{Degree, NoiseLevel};
use crate::shortint::engine::{fill_accumulator, fill_many_lut_accumulator};
use crate::shortint::server_key::{
@@ -46,6 +48,8 @@ mod scalar_sub;
mod shift;
mod sub;
#[cfg(all(test, feature = "__long_run_tests"))]
mod tests_long_run;
#[cfg(test)]
mod tests_signed;
#[cfg(test)]
@@ -203,6 +207,8 @@ impl CudaServerKey {
&self,
ct: &mut T,
streams: &CudaStreams,
input_carry: Option<&CudaBooleanBlock>,
requested_flag: OutputFlag,
) -> T
where
T: CudaIntegerRadixCiphertext,
@@ -210,12 +216,20 @@ impl CudaServerKey {
let mut carry_out: T = self.create_trivial_zero_radix(1, streams);
let ciphertext = ct.as_mut();
let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32;
let uses_carry = input_carry.map_or(0u32, |_block| 1u32);
let mut aux_block: T = self.create_trivial_zero_radix(1, streams);
let in_carry_dvec = input_carry.map_or_else(
|| &aux_block.as_mut().d_blocks.0.d_vec,
|block| &block.0.ciphertext.d_blocks.0.d_vec,
);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
propagate_single_carry_assign_async(
streams,
&mut ciphertext.d_blocks.0.d_vec,
&mut carry_out.as_mut().d_blocks.0.d_vec,
in_carry_dvec,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
d_bsk.input_lwe_dimension(),
@@ -230,6 +244,8 @@ impl CudaServerKey {
ciphertext.info.blocks.first().unwrap().carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
requested_flag,
uses_carry,
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
@@ -237,6 +253,7 @@ impl CudaServerKey {
streams,
&mut ciphertext.d_blocks.0.d_vec,
&mut carry_out.as_mut().d_blocks.0.d_vec,
in_carry_dvec,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
d_multibit_bsk.input_lwe_dimension(),
@@ -251,6 +268,8 @@ impl CudaServerKey {
ciphertext.info.blocks.first().unwrap().carry_modulus,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
requested_flag,
uses_carry,
);
}
};
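Editor's note: the uses_carry / in_carry_dvec pair above lowers the Option<&CudaBooleanBlock> into what the C entry point expects: a 0/1 flag plus an always-valid device pointer, falling back to a trivial auxiliary block when no carry is supplied. A plain-Rust sketch of the same lowering, with Vec<u64> standing in for the device vector:

// Vec<u64> stands in for CudaVec; the real code keeps an auxiliary trivial
// block alive so the pointer handed to C is always valid.
fn lower_input_carry<'a>(
    input_carry: Option<&'a Vec<u64>>,
    aux_block: &'a Vec<u64>,
) -> (u32, &'a Vec<u64>) {
    (input_carry.map_or(0, |_| 1), input_carry.unwrap_or(aux_block))
}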
@@ -269,26 +288,35 @@ impl CudaServerKey {
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
#[allow(dead_code)]
pub(crate) unsafe fn propagate_single_carry_get_input_carries_assign_async<T>(
pub(crate) unsafe fn add_and_propagate_single_carry_assign_async<T>(
&self,
ct: &mut T,
input_carries: &mut T,
lhs: &mut T,
rhs: &T,
streams: &CudaStreams,
input_carry: Option<&CudaBooleanBlock>,
requested_flag: OutputFlag,
) -> T
where
T: CudaIntegerRadixCiphertext,
{
let mut carry_out: T = self.create_trivial_zero_radix(1, streams);
let ciphertext = ct.as_mut();
let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32;
let num_blocks = lhs.as_mut().d_blocks.lwe_ciphertext_count().0 as u32;
let uses_carry = input_carry.map_or(0u32, |_block| 1u32);
let mut aux_block: T = self.create_trivial_zero_radix(1, streams);
let in_carry_dvec = input_carry.map_or_else(
|| &aux_block.as_mut().d_blocks.0.d_vec,
|block| &block.0.ciphertext.d_blocks.0.d_vec,
);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
propagate_single_carry_get_input_carries_assign_async(
add_and_propagate_single_carry_assign_async(
streams,
&mut ciphertext.d_blocks.0.d_vec,
&mut lhs.as_mut().d_blocks.0.d_vec,
&rhs.as_ref().d_blocks.0.d_vec,
&mut carry_out.as_mut().d_blocks.0.d_vec,
&mut input_carries.as_mut().d_blocks.0.d_vec,
in_carry_dvec,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
d_bsk.input_lwe_dimension(),
@@ -299,18 +327,21 @@ impl CudaServerKey {
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
num_blocks,
ciphertext.info.blocks.first().unwrap().message_modulus,
ciphertext.info.blocks.first().unwrap().carry_modulus,
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
requested_flag,
uses_carry,
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
propagate_single_carry_get_input_carries_assign_async(
add_and_propagate_single_carry_assign_async(
streams,
&mut ciphertext.d_blocks.0.d_vec,
&mut lhs.as_mut().d_blocks.0.d_vec,
&rhs.as_ref().d_blocks.0.d_vec,
&mut carry_out.as_mut().d_blocks.0.d_vec,
&mut input_carries.as_mut().d_blocks.0.d_vec,
in_carry_dvec,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
d_multibit_bsk.input_lwe_dimension(),
@@ -321,14 +352,16 @@ impl CudaServerKey {
d_multibit_bsk.decomp_level_count(),
d_multibit_bsk.decomp_base_log(),
num_blocks,
ciphertext.info.blocks.first().unwrap().message_modulus,
ciphertext.info.blocks.first().unwrap().carry_modulus,
self.message_modulus,
self.carry_modulus,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
requested_flag,
uses_carry,
);
}
};
ciphertext.info.blocks.iter_mut().for_each(|b| {
lhs.as_mut().info.blocks.iter_mut().for_each(|b| {
b.degree = Degree::new(b.message_modulus.0 - 1);
b.noise_level = NoiseLevel::NOMINAL;
});

View File

@@ -1,6 +1,7 @@
use crate::core_crypto::gpu::{negate_integer_radix_async, CudaStreams};
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::server_key::radix_parallel::OutputFlag;
impl CudaServerKey {
/// Homomorphically computes the opposite of a ciphertext encrypting an integer message.
@@ -144,7 +145,8 @@ impl CudaServerKey {
};
let mut res = self.unchecked_neg_async(ct, streams);
let _carry = self.propagate_single_carry_assign_async(&mut res, streams);
let _carry =
self.propagate_single_carry_assign_async(&mut res, streams, None, OutputFlag::None);
res
}
}

View File

@@ -8,6 +8,7 @@ use crate::integer::gpu::ciphertext::{
};
use crate::integer::gpu::scalar_addition_integer_radix_assign_async;
use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::prelude::CastInto;
use crate::shortint::ciphertext::NoiseLevel;
@@ -186,7 +187,7 @@ impl CudaServerKey {
};
self.unchecked_scalar_add_assign_async(ct, scalar, streams);
let _carry = self.propagate_single_carry_assign_async(ct, streams);
let _carry = self.propagate_single_carry_assign_async(ct, streams, None, OutputFlag::None);
}
pub fn scalar_add_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, streams: &CudaStreams)
@@ -264,7 +265,8 @@ impl CudaServerKey {
self.unchecked_scalar_add_assign(ct_left, scalar, stream);
let mut carry_out;
unsafe {
carry_out = self.propagate_single_carry_assign_async(ct_left, stream);
carry_out =
self.propagate_single_carry_assign_async(ct_left, stream, None, OutputFlag::Carry);
}
stream.synchronize();
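Editor's note: the call sites in this diff show the pattern: OutputFlag::None when the carry is discarded (plain assign paths) and OutputFlag::Carry when the unsigned overflow flag is wanted, with OutputFlag::from_signedness (used in add.rs above) choosing between carry-out and signed-overflow semantics. A hedged sketch of that selection; the variant set and rule are inferred from these call sites, not from the definition:

enum OutputFlag {
    None,
    Carry,
    Overflow,
}

// Assumed selection rule: unsigned overflow is the carry out of the last
// block, signed overflow needs the dedicated overflow bit.
fn from_signedness(is_signed: bool) -> OutputFlag {
    if is_signed {
        OutputFlag::Overflow
    } else {
        OutputFlag::Carry
    }
}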

View File

@@ -4,6 +4,7 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext};
use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::integer::server_key::TwosComplementNegation;
use crate::prelude::CastInto;
@@ -151,7 +152,7 @@ impl CudaServerKey {
};
self.unchecked_scalar_sub_assign_async(ct, scalar, stream);
let _carry = self.propagate_single_carry_assign_async(ct, stream);
let _carry = self.propagate_single_carry_assign_async(ct, stream, None, OutputFlag::None);
}
pub fn scalar_sub_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, stream: &CudaStreams)

View File

@@ -1,18 +1,17 @@
use super::add::SignedOperation;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{CiphertextModulus, LweBskGroupingFactor, LweCiphertextCount};
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo;
use crate::integer::gpu::ciphertext::{
CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
CudaUnsignedRadixCiphertext,
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::NoiseLevel;
use crate::shortint::parameters::{Degree, LweBskGroupingFactor};
impl CudaServerKey {
/// Computes homomorphically a subtraction between two ciphertexts encrypting integer values.
@@ -271,8 +270,14 @@ impl CudaServerKey {
}
};
self.unchecked_sub_assign_async(lhs, rhs, streams);
let _carry = self.propagate_single_carry_assign_async(lhs, streams);
let neg_rhs = self.unchecked_neg_async(rhs, streams);
let _carry = self.add_and_propagate_single_carry_assign_async(
lhs,
&neg_rhs,
streams,
None,
OutputFlag::None,
);
}
pub fn unsigned_overflowing_sub(
@@ -353,87 +358,102 @@ impl CudaServerKey {
rhs: &CudaUnsignedRadixCiphertext,
stream: &CudaStreams,
) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) {
let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
let mut tmp: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream);
if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
&& rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
{
tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::ZERO);
} else {
tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::NOMINAL);
}
let mut ct_res = lhs.duplicate_async(stream);
let block = CudaLweCiphertextList::new(
tmp.as_ref().d_blocks.lwe_dimension(),
LweCiphertextCount(1),
CiphertextModulus::new_native(),
stream,
);
let block_info = tmp.as_ref().info.blocks[0];
let ct_info = vec![block_info];
let ct_info = CudaRadixCiphertextInfo { blocks: ct_info };
let mut ct_overflowed =
CudaBooleanBlock::from_cuda_radix_ciphertext(CudaRadixCiphertext::new(block, ct_info));
let compute_overflow = true;
const INPUT_BORROW: Option<&CudaBooleanBlock> = None;
let mut overflow_block: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(1, stream);
let ciphertext = ct_res.as_mut();
let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32;
let uses_input_borrow = INPUT_BORROW.map_or(0u32, |_block| 1u32);
let mut aux_block: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream);
let in_carry_dvec = INPUT_BORROW.map_or_else(
|| &aux_block.as_mut().d_blocks.0.d_vec,
|block| &block.0.ciphertext.d_blocks.0.d_vec,
);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
stream,
&mut ct_res.as_mut().d_blocks.0.d_vec,
&mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec,
&lhs.as_ref().d_blocks.0.d_vec,
&mut ciphertext.d_blocks.0.d_vec,
&rhs.as_ref().d_blocks.0.d_vec,
&mut overflow_block.as_mut().d_blocks.0.d_vec,
in_carry_dvec,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
num_blocks,
ciphertext.info.blocks.first().unwrap().message_modulus,
ciphertext.info.blocks.first().unwrap().carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
compute_overflow,
uses_input_borrow,
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
stream,
&mut ct_res.as_mut().d_blocks.0.d_vec,
&mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec,
&lhs.as_ref().d_blocks.0.d_vec,
&mut ciphertext.d_blocks.0.d_vec,
&rhs.as_ref().d_blocks.0.d_vec,
&mut overflow_block.as_mut().d_blocks.0.d_vec,
in_carry_dvec,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
d_multibit_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.decomp_level_count(),
d_multibit_bsk.decomp_base_log(),
num_blocks,
ciphertext.info.blocks.first().unwrap().message_modulus,
ciphertext.info.blocks.first().unwrap().carry_modulus,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
compute_overflow,
uses_input_borrow,
);
}
};
ciphertext.info.blocks.iter_mut().for_each(|b| {
b.degree = Degree::new(b.message_modulus.0 - 1);
b.noise_level = NoiseLevel::NOMINAL;
});
overflow_block
.as_mut()
.info
.blocks
.iter_mut()
.for_each(|b| {
b.degree = Degree::new(1);
b.noise_level = NoiseLevel::ZERO;
});
if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
&& rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
{
overflow_block.as_mut().info =
overflow_block.as_ref().info.boolean_info(NoiseLevel::ZERO);
} else {
overflow_block.as_mut().info = overflow_block
.as_ref()
.info
.boolean_info(NoiseLevel::NOMINAL);
}
let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(overflow_block.ciphertext);
ct_res.as_mut().info = ct_res
.as_ref()
@@ -541,11 +561,34 @@ impl CudaServerKey {
ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 > 0,
"inputs cannot be empty"
);
let result;
let overflowed;
unsafe {
(result, overflowed) =
self.unchecked_signed_overflowing_sub_async(ct_left, ct_right, stream);
};
stream.synchronize();
(result, overflowed)
}
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until stream is synchronized
pub unsafe fn unchecked_signed_overflowing_sub_async(
&self,
ct_left: &CudaSignedRadixCiphertext,
ct_right: &CudaSignedRadixCiphertext,
stream: &CudaStreams,
) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) {
let flipped_rhs = self.bitnot(ct_right, stream);
let ct_input_carry: CudaUnsignedRadixCiphertext =
self.create_trivial_radix_async(1, 1, stream);
let input_carry = CudaBooleanBlock::from_cuda_radix_ciphertext(ct_input_carry.ciphertext);
self.unchecked_signed_overflowing_add_or_sub(
self.unchecked_signed_overflowing_add_async(
ct_left,
ct_right,
SignedOperation::Subtraction,
&flipped_rhs,
Some(&input_carry),
stream,
)
}
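Editor's note: the rewritten signed overflowing sub relies on the two's-complement identity a - b = a + !b + 1: bitnot supplies !b and the trivial input carry supplies the +1, so subtraction reuses the add-and-propagate path. A plain-integer check of the identity:

fn main() {
    // a - b == a + !b + 1 in two's complement.
    let (a, b) = (7i64, 9i64);
    assert_eq!(a - b, a.wrapping_add(!b).wrapping_add(1));
}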

Some files were not shown because too many files have changed in this diff.