mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-11 07:38:08 -05:00
Compare commits
76 Commits
al/fix_gpu
...
al/debug_s
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
071934532e | ||
|
|
87b41ebbe5 | ||
|
|
7ba7dd3e19 | ||
|
|
7f122bb435 | ||
|
|
56c0811751 | ||
|
|
580ac5aa99 | ||
|
|
cf34e9b7f0 | ||
|
|
856fc1a709 | ||
|
|
fe0a195630 | ||
|
|
aca7e79585 | ||
|
|
c0e89a53ef | ||
|
|
312952007f | ||
|
|
ff51ed3f34 | ||
|
|
9737bdcb98 | ||
|
|
87a43a4900 | ||
|
|
345bdbf17f | ||
|
|
cc54ba2236 | ||
|
|
11df6c69ee | ||
|
|
b76f4dbfe0 | ||
|
|
be21c15c80 | ||
|
|
aa51b25313 | ||
|
|
300c95fe3d | ||
|
|
524adda8f6 | ||
|
|
dedcf205b4 | ||
|
|
2c8d4c0fb0 | ||
|
|
3370fb5b7e | ||
|
|
cd77eac42b | ||
|
|
40f20b4ecb | ||
|
|
59a78c76a9 | ||
|
|
1025246b17 | ||
|
|
338e9eaeef | ||
|
|
0bec4d2ba1 | ||
|
|
c5fab98900 | ||
|
|
14e1ee5bd3 | ||
|
|
52bc778629 | ||
|
|
10405c9836 | ||
|
|
5eaf6cec55 | ||
|
|
3bfacc1e9d | ||
|
|
a47a418d41 | ||
|
|
75b3141e19 | ||
|
|
d01328e0fe | ||
|
|
6e102b5fa1 | ||
|
|
8aa6fa514e | ||
|
|
21a19cd3c5 | ||
|
|
f51c70d536 | ||
|
|
66e3c02838 | ||
|
|
408e81c45a | ||
|
|
4152906c5d | ||
|
|
9fc8a0b5bc | ||
|
|
5dc3e59d13 | ||
|
|
b40996a7e5 | ||
|
|
b066ef19fa | ||
|
|
25d008bae8 | ||
|
|
2749c1088c | ||
|
|
c19cd9f021 | ||
|
|
45fdba04b1 | ||
|
|
69d46810b8 | ||
|
|
a16eeb983f | ||
|
|
8278a9373c | ||
|
|
e2a2768484 | ||
|
|
57cfc38b66 | ||
|
|
259d125434 | ||
|
|
2571196b41 | ||
|
|
9f3dc6167d | ||
|
|
59c17692a3 | ||
|
|
e29d615b9d | ||
|
|
8caff604ed | ||
|
|
16badf0c00 | ||
|
|
99a27c1cbe | ||
|
|
9131aaa383 | ||
|
|
a01949e630 | ||
|
|
30a58cdd1a | ||
|
|
03325bf94e | ||
|
|
786fe66495 | ||
|
|
9ee8259002 | ||
|
|
a7d8d2b1d4 |
1
.github/actionlint.yaml
vendored
1
.github/actionlint.yaml
vendored
@@ -6,6 +6,7 @@ self-hosted-runner:
|
||||
- large_windows_16_latest
|
||||
- large_ubuntu_16
|
||||
- large_ubuntu_16-22.04
|
||||
- v80-desktop
|
||||
# Configuration variables in array of strings defined in your repository or
|
||||
# organization. `null` means disabling configuration variables check.
|
||||
# Empty array means no configuration variable is allowed.
|
||||
|
||||
4
.github/actions/gpu_setup/action.yml
vendored
4
.github/actions/gpu_setup/action.yml
vendored
@@ -33,7 +33,9 @@ runs:
|
||||
if: inputs.github-instance == 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
TOOLKIT_VERSION="$(echo ${CUDA_VERSION} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
|
||||
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
|
||||
# shellcheck disable=SC2001
|
||||
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
|
||||
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${env.CUDA_KEYRING_PACKAGE}
|
||||
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
|
||||
sha256sum -c checksum
|
||||
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -126,9 +126,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: ${{ failure() && github.event_name == 'pull_request' }}
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
6
.github/workflows/aws_tfhe_fast_tests.yml
vendored
6
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -174,7 +174,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -182,6 +182,7 @@ jobs:
|
||||
if: needs.should-run.outputs.csprng_test == 'true'
|
||||
run: |
|
||||
make test_tfhe_csprng
|
||||
make test_tfhe_csprng_big_endian
|
||||
|
||||
- name: Run tfhe-zk-pok tests
|
||||
if: needs.should-run.outputs.zk_pok_test == 'true'
|
||||
@@ -272,9 +273,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: ${{ failure() && github.event_name == 'pull_request' }}
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
|
||||
|
||||
5
.github/workflows/aws_tfhe_integer_tests.yml
vendored
5
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -114,7 +114,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -142,9 +142,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: ${{ failure() && github.event_name == 'pull_request' }}
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
@@ -115,7 +115,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -147,9 +147,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: ${{ failure() && github.event_name == 'pull_request' }}
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
5
.github/workflows/aws_tfhe_tests.yml
vendored
5
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -185,7 +185,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -254,9 +254,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: ${{ failure() && github.event_name == 'pull_request' }}
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
5
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
5
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -68,7 +68,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -123,9 +123,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: ${{ failure() && github.event_name == 'pull_request' }}
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
14
.github/workflows/benchmark_boolean.yml
vendored
14
.github/workflows/benchmark_boolean.yml
vendored
@@ -58,14 +58,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -114,8 +117,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
14
.github/workflows/benchmark_core_crypto.yml
vendored
14
.github/workflows/benchmark_core_crypto.yml
vendored
@@ -58,14 +58,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -107,8 +110,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
14
.github/workflows/benchmark_dex.yml
vendored
14
.github/workflows/benchmark_dex.yml
vendored
@@ -58,14 +58,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -116,8 +119,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
14
.github/workflows/benchmark_erc20.yml
vendored
14
.github/workflows/benchmark_erc20.yml
vendored
@@ -59,14 +59,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -111,8 +114,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
30
.github/workflows/benchmark_gpu_4090.yml
vendored
30
.github/workflows/benchmark_gpu_4090.yml
vendored
@@ -46,15 +46,18 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
echo "FAST_BENCH=TRUE";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -93,8 +96,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
@@ -124,14 +130,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -170,8 +179,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
44
.github/workflows/benchmark_gpu_common.yml
vendored
44
.github/workflows/benchmark_gpu_common.yml
vendored
@@ -84,7 +84,7 @@ jobs:
|
||||
run: |
|
||||
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
|
||||
# shellcheck disable=SC2001
|
||||
PARSED_COMMAND=$(echo "${INPUTS_COMMAND}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
|
||||
PARSED_COMMAND=$(echo "${INPUTS_COMMAND}" | sed 's/[[:space:]]*,[[:space:]]*/\", \"/g')
|
||||
echo "COMMAND=[\"${PARSED_COMMAND}\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set single operations flavor
|
||||
@@ -120,26 +120,33 @@ jobs:
|
||||
env:
|
||||
INPUTS_PARAMS_TYPE: ${{ inputs.params_type }}
|
||||
|
||||
|
||||
- name: Set command output
|
||||
id: set_command
|
||||
run: |
|
||||
echo "command=${{ toJSON(env.COMMAND) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "command=${COMMAND_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
COMMAND_OUTPUT: ${{ toJSON(env.COMMAND) }}
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: |
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
|
||||
|
||||
- name: Set parameters types output
|
||||
id: set_params_type
|
||||
run: |
|
||||
echo "params_type=${{ toJSON(env.PARAMS_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "params_type=${PARAMS_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
PARAMS_TYPE_OUTPUT: ${{ toJSON(env.PARAMS_TYPE) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-${{ inputs.profile }}-benchmarks)
|
||||
@@ -227,6 +234,8 @@ jobs:
|
||||
include:
|
||||
- cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
@@ -237,18 +246,20 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
# Re-export environment variables as dependencies setup perform this task in the previous job.
|
||||
# Local env variables are cleaned at the end of each job.
|
||||
- name: Export CUDA variables
|
||||
shell: bash
|
||||
run: |
|
||||
CUDA_PATH=/usr/local/cuda-${{ matrix.cuda }}
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "PATH=$PATH:$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
@@ -258,13 +269,15 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
|
||||
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -317,8 +330,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
|
||||
14
.github/workflows/benchmark_gpu_dex_common.yml
vendored
14
.github/workflows/benchmark_gpu_dex_common.yml
vendored
@@ -119,14 +119,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -167,8 +170,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
|
||||
14
.github/workflows/benchmark_gpu_erc20_common.yml
vendored
14
.github/workflows/benchmark_gpu_erc20_common.yml
vendored
@@ -120,14 +120,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -168,8 +171,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
|
||||
94
.github/workflows/benchmark_hpu_integer.yml
vendored
Normal file
94
.github/workflows/benchmark_hpu_integer.yml
vendored
Normal file
@@ -0,0 +1,94 @@
|
||||
# Run all integer benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
|
||||
name: Hpu Integer Benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
integer-benchmarks-hpu:
|
||||
name: Execute integer & erc20 benchmarks for HPU backend
|
||||
runs-on: v80-desktop
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
steps:
|
||||
# Needed as long as hw_regmap repository is private
|
||||
- name: Configure SSH
|
||||
uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
|
||||
with:
|
||||
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
make bench_integer_hpu
|
||||
make bench_hlapi_erc20_hpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
|
||||
--database tfhe_rs \
|
||||
--hardware "hpu_x1" \
|
||||
--backend hpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch "${REF_NAME}" \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${BENCH_DATE}" \
|
||||
--walk-subdirs
|
||||
env:
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
|
||||
with:
|
||||
name: ${{ github.sha }}_integer_benchmarks
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
22
.github/workflows/benchmark_integer.yml
vendored
22
.github/workflows/benchmark_integer.yml
vendored
@@ -79,12 +79,16 @@ jobs:
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: |
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (integer-benchmarks)
|
||||
@@ -128,14 +132,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -193,8 +200,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
18
.github/workflows/benchmark_shortint.yml
vendored
18
.github/workflows/benchmark_shortint.yml
vendored
@@ -48,7 +48,9 @@ jobs:
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (shortint-benchmarks)
|
||||
@@ -89,14 +91,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -150,8 +155,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
22
.github/workflows/benchmark_signed_integer.yml
vendored
22
.github/workflows/benchmark_signed_integer.yml
vendored
@@ -79,12 +79,16 @@ jobs:
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: |
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (signed-integer-benchmarks)
|
||||
@@ -128,14 +132,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -185,8 +192,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
12
.github/workflows/benchmark_tfhe_fft.yml
vendored
12
.github/workflows/benchmark_tfhe_fft.yml
vendored
@@ -61,11 +61,14 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
@@ -107,8 +110,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
12
.github/workflows/benchmark_tfhe_ntt.yml
vendored
12
.github/workflows/benchmark_tfhe_ntt.yml
vendored
@@ -61,11 +61,14 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
@@ -107,8 +110,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
14
.github/workflows/benchmark_tfhe_zk_pok.yml
vendored
14
.github/workflows/benchmark_tfhe_zk_pok.yml
vendored
@@ -98,14 +98,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -155,8 +158,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
24
.github/workflows/benchmark_wasm_client.yml
vendored
24
.github/workflows/benchmark_wasm_client.yml
vendored
@@ -96,14 +96,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -136,12 +139,16 @@ jobs:
|
||||
|
||||
- name: Install web resources
|
||||
run: |
|
||||
make install_${{ matrix.browser }}_browser
|
||||
make install_${{ matrix.browser }}_web_driver
|
||||
make install_"${BROWSER}"_browser
|
||||
make install_"${BROWSER}"_web_driver
|
||||
env:
|
||||
BROWSER: ${{ matrix.browser }}
|
||||
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
make bench_web_js_api_parallel_${{ matrix.browser }}_ci
|
||||
make bench_web_js_api_parallel_"${BROWSER}"_ci
|
||||
env:
|
||||
BROWSER: ${{ matrix.browser }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -188,8 +195,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
18
.github/workflows/benchmark_zk_pke.yml
vendored
18
.github/workflows/benchmark_zk_pke.yml
vendored
@@ -93,7 +93,9 @@ jobs:
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: |
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
|
||||
env:
|
||||
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (pke-zk-benchmarks)
|
||||
@@ -140,14 +142,17 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_DATE=${COMMIT_DATE}";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -205,8 +210,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
|
||||
--slab-url "${SLAB_URL}"
|
||||
env:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
|
||||
7
.github/workflows/cargo_build.yml
vendored
7
.github/workflows/cargo_build.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -94,5 +94,10 @@ jobs:
|
||||
run: |
|
||||
make build_tfhe_coverage
|
||||
|
||||
- name: Run Hpu pcc checks
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make pcc_hpu
|
||||
|
||||
# The wasm build check is a bit annoying to set-up here and is done during the tests in
|
||||
# aws_tfhe_tests.yml
|
||||
|
||||
4
.github/workflows/cargo_test_fft.yml
vendored
4
.github/workflows/cargo_test_fft.yml
vendored
@@ -51,7 +51,7 @@ jobs:
|
||||
runs-on: ${{ matrix.runner_type }}
|
||||
strategy:
|
||||
matrix:
|
||||
runner_type: [ubuntu-latest, macos-latest, windows-latest]
|
||||
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
runs-on: ${{ matrix.runner_type }}
|
||||
strategy:
|
||||
matrix:
|
||||
runner_type: [ubuntu-latest, macos-latest, windows-latest]
|
||||
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
|
||||
4
.github/workflows/cargo_test_ntt.yml
vendored
4
.github/workflows/cargo_test_ntt.yml
vendored
@@ -51,7 +51,7 @@ jobs:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
os: [ ubuntu-latest, macos-latest, windows-latest ]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
@@ -77,7 +77,7 @@ jobs:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
os: [ ubuntu-latest, macos-latest, windows-latest ]
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
|
||||
10
.github/workflows/ci_lint.yml
vendored
10
.github/workflows/ci_lint.yml
vendored
@@ -25,10 +25,10 @@ jobs:
|
||||
|
||||
- name: Get actionlint
|
||||
run: |
|
||||
wget "https://github.com/rhysd/actionlint/releases/download/v${{ env.ACTIONLINT_VERSION }}/actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz"
|
||||
echo "${{ env.ACTIONLINT_CHECKSUM }} actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz" > checksum
|
||||
wget "https://github.com/rhysd/actionlint/releases/download/v${ACTIONLINT_VERSION}/actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz"
|
||||
echo "${ACTIONLINT_CHECKSUM} actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz" > checksum
|
||||
sha256sum -c checksum
|
||||
tar -xf actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz actionlint
|
||||
tar -xf actionlint_"${ACTIONLINT_VERSION}"_linux_amd64.tar.gz actionlint
|
||||
ln -s "$(pwd)/actionlint" /usr/local/bin/
|
||||
|
||||
- name: Lint workflows
|
||||
@@ -38,9 +38,11 @@ jobs:
|
||||
- name: Check workflows security
|
||||
run: |
|
||||
make check_workflow_security
|
||||
env:
|
||||
GH_TOKEN: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Ensure SHA pinned actions
|
||||
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@4830be28ce81da52ec70d65c552a7403821d98d4 # v3.0.23
|
||||
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@fc87bb5b5a97953d987372e74478de634726b3e5 # v3.0.25
|
||||
with:
|
||||
allowlist: |
|
||||
slsa-framework/slsa-github-generator
|
||||
|
||||
6
.github/workflows/code_coverage.yml
vendored
6
.github/workflows/code_coverage.yml
vendored
@@ -54,7 +54,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
make test_shortint_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
|
||||
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
@@ -104,7 +104,7 @@ jobs:
|
||||
make test_integer_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
|
||||
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
13
.github/workflows/data_pr_close.yml
vendored
13
.github/workflows/data_pr_close.yml
vendored
@@ -59,7 +59,7 @@ jobs:
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "Authorization: Bearer ${TOKEN}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"${COMMENTS_URL}" \
|
||||
-d "${BODY}"
|
||||
@@ -71,6 +71,7 @@ jobs:
|
||||
REPO: ${{ github.repository }}
|
||||
EVENT_NUMBER: ${{ github.event.number }}
|
||||
COMMENTS_URL: ${{ fromJson(env.TARGET_REPO_PR).comments_url }}
|
||||
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Merge the Pull Request in the data repo
|
||||
if: ${{ github.event.pull_request.merged }}
|
||||
@@ -81,7 +82,7 @@ jobs:
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X PUT \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "Authorization: Bearer ${TOKEN}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"${TARGET_REPO_PR_URL}"/merge \
|
||||
-d '{ "merge_method": "rebase" }'
|
||||
@@ -91,6 +92,7 @@ jobs:
|
||||
exit $RES
|
||||
env:
|
||||
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
|
||||
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Close the Pull Request in the data repo
|
||||
if: ${{ !github.event.pull_request.merged }}
|
||||
@@ -101,7 +103,7 @@ jobs:
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X PATCH \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "Authorization: Bearer ${TOKEN}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"${TARGET_REPO_PR_URL}" \
|
||||
-d '{ "state": "closed" }'
|
||||
@@ -111,6 +113,7 @@ jobs:
|
||||
exit $RES
|
||||
env:
|
||||
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
|
||||
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Delete the associated branch in the data repo
|
||||
run: |
|
||||
@@ -120,13 +123,15 @@ jobs:
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X DELETE \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "Authorization: Bearer ${TOKEN}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"${TARGET_REPO_API_URL}"/git/refs/heads/"${PR_BRANCH}"
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
env:
|
||||
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() && job.status == 'failure' }}
|
||||
|
||||
2
.github/workflows/gpu_4090_tests.yml
vendored
2
.github/workflows/gpu_4090_tests.yml
vendored
@@ -45,7 +45,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
5
.github/workflows/gpu_fast_h100_tests.yml
vendored
5
.github/workflows/gpu_fast_h100_tests.yml
vendored
@@ -140,7 +140,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -172,9 +172,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
|
||||
5
.github/workflows/gpu_fast_tests.yml
vendored
5
.github/workflows/gpu_fast_tests.yml
vendored
@@ -124,7 +124,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -156,9 +156,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
|
||||
2
.github/workflows/gpu_full_h100_tests.yml
vendored
2
.github/workflows/gpu_full_h100_tests.yml
vendored
@@ -79,7 +79,7 @@ jobs:
|
||||
gcc-version: ${{ matrix.gcc }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -149,7 +149,7 @@ jobs:
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=FALSE make test_high_level_api_gpu
|
||||
make test_high_level_api_gpu
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
@@ -161,9 +161,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
gcc-version: ${{ matrix.gcc }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
29
.github/workflows/gpu_pcc.yml
vendored
29
.github/workflows/gpu_pcc.yml
vendored
@@ -1,4 +1,4 @@
|
||||
# Perfom tfhe-cuda-backend post-commit checks on an AWS instance
|
||||
# Perform tfhe-cuda-backend post-commit checks on an AWS instance
|
||||
name: Cuda - Post-commit Checks
|
||||
|
||||
env:
|
||||
@@ -81,16 +81,20 @@ jobs:
|
||||
if: env.SECRETS_AVAILABLE == 'false'
|
||||
shell: bash
|
||||
run: |
|
||||
TOOLKIT_VERSION="$(echo ${{ matrix.cuda }} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
|
||||
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
|
||||
# shellcheck disable=SC2001
|
||||
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
|
||||
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/"${CUDA_KEYRING_PACKAGE}"
|
||||
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
|
||||
sha256sum -c checksum
|
||||
sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
|
||||
sudo apt update
|
||||
sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
|
||||
env:
|
||||
CUDA_VERSION: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -100,17 +104,21 @@ jobs:
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
env:
|
||||
CUDA_VERSION: ${{ matrix.cuda }}
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
|
||||
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Run fmt checks
|
||||
run: |
|
||||
@@ -120,12 +128,17 @@ jobs:
|
||||
run: |
|
||||
make pcc_gpu
|
||||
|
||||
- name: Check build with hpu enabled
|
||||
run: |
|
||||
make clippy_gpu_hpu
|
||||
|
||||
- name: Set pull-request URL
|
||||
if: ${{ failure() && github.event_name == 'pull_request' }}
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
|
||||
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -144,9 +144,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
|
||||
@@ -140,7 +140,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -158,9 +158,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -156,9 +156,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -144,9 +144,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
|
||||
@@ -140,7 +140,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -158,9 +158,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -156,9 +156,10 @@ jobs:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
|
||||
73
.github/workflows/hpu_hlapi_tests.yml
vendored
Normal file
73
.github/workflows/hpu_hlapi_tests.yml
vendored
Normal file
@@ -0,0 +1,73 @@
|
||||
# Test tfhe-fft
|
||||
name: Cargo Test HLAPI HPU
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
|
||||
permissions: { }
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: read
|
||||
outputs:
|
||||
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
|
||||
with:
|
||||
files_yaml: |
|
||||
hpu:
|
||||
- tfhe/Cargo.toml
|
||||
- Makefile
|
||||
- backends/tfhe-hpu-backend/**
|
||||
- mockups/tfhe-hpu-mockup/**
|
||||
|
||||
cargo-tests-hpu:
|
||||
needs: should-run
|
||||
if: needs.should-run.outputs.hpu_test == 'true'
|
||||
runs-on: large_ubuntu_16
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install Rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
with:
|
||||
toolchain: stable
|
||||
override: true
|
||||
|
||||
- name: Install Just
|
||||
run: |
|
||||
cargo install just
|
||||
|
||||
- name: Test HLAPI HPU
|
||||
run: |
|
||||
source setup_hpu.sh
|
||||
just -f mockups/tfhe-hpu-mockup/Justfile BUILD_PROFILE=release mockup &
|
||||
make HPU_CONFIG=sim test_high_level_api_hpu
|
||||
make HPU_CONFIG=sim test_user_doc_hpu
|
||||
2
.github/workflows/integer_long_run_tests.yml
vendored
2
.github/workflows/integer_long_run_tests.yml
vendored
@@ -57,7 +57,7 @@ jobs:
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
2
.github/workflows/m1_tests.yml
vendored
2
.github/workflows/m1_tests.yml
vendored
@@ -46,7 +46,7 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
26
.github/workflows/make_release_cuda.yml
vendored
26
.github/workflows/make_release_cuda.yml
vendored
@@ -67,7 +67,7 @@ jobs:
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -78,17 +78,19 @@ jobs:
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
CUDA_VERSION: ${{ matrix.cuda }}
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
|
||||
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
- name: Prepare package
|
||||
@@ -129,7 +131,7 @@ jobs:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
|
||||
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -140,19 +142,23 @@ jobs:
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
CUDA_VERSION: ${{ matrix.cuda }}
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
|
||||
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
env:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
|
||||
105
.github/workflows/make_release_hpu.yml
vendored
Normal file
105
.github/workflows/make_release_hpu.yml
vendored
Normal file
@@ -0,0 +1,105 @@
|
||||
name: Publish HPU release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
verify_tag:
|
||||
uses: ./.github/workflows/verify_tagged_commit.yml
|
||||
secrets:
|
||||
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
package:
|
||||
runs-on: ubuntu-latest
|
||||
needs: verify_tag
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-hpu-backend
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance:
|
||||
if: ${{ !inputs.dry_run }}
|
||||
needs: [package]
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
# Needed to create the provenance via GitHub OIDC
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
with:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
publish_release:
|
||||
name: Publish tfhe-hpu-backend Release
|
||||
runs-on: ubuntu-latest
|
||||
needs: [verify_tag, package] # for comparing hashes
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Slack notification (hashes comparison)
|
||||
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-hpu-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-hpu-backend release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
2
.lfsconfig
Normal file
2
.lfsconfig
Normal file
@@ -0,0 +1,2 @@
|
||||
[lfs]
|
||||
fetchexclude = *
|
||||
@@ -10,3 +10,9 @@
|
||||
/tfhe/src/integer/gpu
|
||||
|
||||
/tfhe/src/high_level_api/ @tmontaigu
|
||||
|
||||
/Makefile @IceTDrinker @soonum
|
||||
|
||||
/.github/ @soonum
|
||||
|
||||
/CODEOWNERS @IceTDrinker
|
||||
|
||||
@@ -9,10 +9,12 @@ members = [
|
||||
"tasks",
|
||||
"tfhe-csprng",
|
||||
"backends/tfhe-cuda-backend",
|
||||
"backends/tfhe-hpu-backend",
|
||||
"utils/tfhe-versionable",
|
||||
"utils/tfhe-versionable-derive",
|
||||
"utils/param_dedup",
|
||||
"tests",
|
||||
"mockups/tfhe-hpu-mockup",
|
||||
]
|
||||
|
||||
exclude = [
|
||||
|
||||
120
Makefile
120
Makefile
@@ -2,6 +2,7 @@ SHELL:=$(shell /usr/bin/env which bash)
|
||||
OS:=$(shell uname)
|
||||
RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
|
||||
CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
|
||||
CARGO_BUILD_JOBS=default
|
||||
CPU_COUNT=$(shell ./scripts/cpu_count.sh)
|
||||
RS_BUILD_TOOLCHAIN:=stable
|
||||
CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
|
||||
@@ -55,6 +56,9 @@ REGEX_PATTERN?=''
|
||||
TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
|
||||
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
|
||||
|
||||
# tfhe-hpu-backend
|
||||
HPU_CONFIG=v80
|
||||
|
||||
# Exclude these files from coverage reports
|
||||
define COVERAGE_EXCLUDED_FILES
|
||||
--exclude-files apps/trivium/src/trivium/* \
|
||||
@@ -166,9 +170,13 @@ install_typos_checker: install_rs_build_toolchain
|
||||
.PHONY: install_zizmor # Install zizmor workflow security checker
|
||||
install_zizmor: install_rs_build_toolchain
|
||||
@zizmor --version > /dev/null 2>&1 || \
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor || \
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
|
||||
( echo "Unable to install zizmor, unknown error." && exit 1 )
|
||||
|
||||
.PHONY: install_cargo_cross # Install custom tfhe-rs lints
|
||||
install_cargo_cross: install_rs_build_toolchain
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross
|
||||
|
||||
.PHONY: setup_venv # Setup Python virtualenv for wasm tests
|
||||
setup_venv:
|
||||
python3 -m venv venv
|
||||
@@ -290,7 +298,7 @@ check_typos: install_typos_checker
|
||||
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
|
||||
clippy_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
@@ -301,6 +309,20 @@ check_gpu: install_rs_check_toolchain
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: clippy_hpu # Run clippy lints on tfhe with "hpu" enabled
|
||||
clippy_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,hpu,pbs-stats,extended-types \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
|
||||
clippy_gpu_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
|
||||
fix_newline: check_linelint_installed
|
||||
linelint -a .
|
||||
@@ -473,6 +495,11 @@ clippy_cuda_backend: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-cuda-backend -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend
|
||||
clippy_hpu_backend: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-hpu-backend -- --no-deps -D warnings
|
||||
|
||||
.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
|
||||
check_rust_bindings_did_not_change:
|
||||
cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
|
||||
@@ -702,6 +729,28 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_integer_hpu_ci # Run the tests for integer ci on hpu backend
|
||||
test_integer_hpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
cargo test --release -p $(TFHE_SPEC) --features hpu-v80 --test hpu
|
||||
|
||||
.PHONY: test_integer_hpu_mockup_ci # Run the tests for integer ci on hpu backend and mockup
|
||||
test_integer_hpu_mockup_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
source ./setup_hpu.sh --config sim ; \
|
||||
cargo build --release --bin hpu_mockup; \
|
||||
coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \
|
||||
HPU_TEST_ITER=1 \
|
||||
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
|
||||
kill %1
|
||||
|
||||
.PHONY: test_integer_hpu_mockup_ci_fast # Run the quick tests for integer ci on hpu backend and mockup.
|
||||
test_integer_hpu_mockup_ci_fast: install_rs_check_toolchain install_cargo_nextest
|
||||
source ./setup_hpu.sh --config sim ; \
|
||||
cargo build --profile devo --bin hpu_mockup; \
|
||||
coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \
|
||||
HPU_TEST_ITER=1 \
|
||||
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
|
||||
kill %1
|
||||
|
||||
.PHONY: test_boolean # Run the tests of the boolean module
|
||||
test_boolean: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
@@ -854,9 +903,25 @@ test_high_level_api: install_rs_build_toolchain
|
||||
|
||||
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
|
||||
--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \
|
||||
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
|
||||
-E "test(/high_level_api::.*gpu.*/)"
|
||||
|
||||
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
|
||||
ifeq ($(HPU_CONFIG), v80)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
|
||||
--build-jobs=$(CARGO_BUILD_JOBS) \
|
||||
--test-threads=1 \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80 -p $(TFHE_SPEC) \
|
||||
-E "test(/high_level_api::.*hpu.*/)"
|
||||
else
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
|
||||
--build-jobs=$(CARGO_BUILD_JOBS) \
|
||||
--test-threads=1 \
|
||||
--features=integer,internal-keycache,hpu -p $(TFHE_SPEC) \
|
||||
-E "test(/high_level_api::.*hpu.*/)"
|
||||
endif
|
||||
|
||||
|
||||
.PHONY: test_strings # Run the tests for strings ci
|
||||
test_strings: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
@@ -874,9 +939,21 @@ test_user_doc: install_rs_build_toolchain
|
||||
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
|
||||
test_user_doc_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
|
||||
--features=internal-keycache,integer,zk-pok,gpu -p $(TFHE_SPEC) \
|
||||
-- test_user_docs::
|
||||
|
||||
.PHONY: test_user_doc_hpu # Run tests for HPU from the .md documentation
|
||||
test_user_doc_hpu: install_rs_build_toolchain
|
||||
ifeq ($(HPU_CONFIG), v80)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=internal-keycache,integer,hpu,hpu-v80 -p $(TFHE_SPEC) \
|
||||
-- test_user_docs::
|
||||
else
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=internal-keycache,integer,hpu -p $(TFHE_SPEC) \
|
||||
-- test_user_docs::
|
||||
endif
|
||||
|
||||
|
||||
|
||||
.PHONY: test_regex_engine # Run tests for regex_engine example
|
||||
@@ -907,6 +984,12 @@ test_tfhe_csprng: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-csprng
|
||||
|
||||
.PHONY: test_tfhe_csprng_big_endian # Run tfhe-csprng tests on an emulated big endian system
|
||||
test_tfhe_csprng_big_endian: install_rs_build_toolchain install_cargo_cross
|
||||
RUSTFLAGS="" cross $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
|
||||
|
||||
|
||||
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
|
||||
test_zk_pok: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
@@ -1012,7 +1095,7 @@ check_compile_tests: install_rs_build_toolchain
|
||||
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
|
||||
check_compile_tests_benches_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=experimental,boolean,shortint,integer,internal-keycache,gpu \
|
||||
--features=experimental,boolean,shortint,integer,internal-keycache,gpu,zk-pok \
|
||||
-p $(TFHE_SPEC)
|
||||
mkdir -p "$(TFHECUDA_BUILD)" && \
|
||||
cd "$(TFHECUDA_BUILD)" && \
|
||||
@@ -1100,6 +1183,12 @@ clippy_bench_gpu: install_rs_check_toolchain
|
||||
--features=gpu,shortint,integer,internal-keycache,nightly-avx512,pbs-stats,zk-pok \
|
||||
-p tfhe-benchmark -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_bench_hpu # Run clippy lints on tfhe-benchmark
|
||||
clippy_bench_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=hpu,shortint,integer,internal-keycache,pbs-stats\
|
||||
-p tfhe-benchmark -- --no-deps -D warnings
|
||||
|
||||
.PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
|
||||
print_doc_bench_parameters:
|
||||
RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
|
||||
@@ -1133,6 +1222,14 @@ bench_signed_integer_gpu: install_rs_check_toolchain
|
||||
--bench integer-signed-bench \
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
|
||||
bench_integer_hpu: install_rs_check_toolchain
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
|
||||
|
||||
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
|
||||
bench_integer_compression: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
@@ -1146,7 +1243,7 @@ bench_integer_compression_gpu: install_rs_check_toolchain
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench glwe_packing_compression-integer-bench \
|
||||
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
|
||||
.PHONY: bench_integer_zk_gpu
|
||||
bench_integer_zk_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
@@ -1324,6 +1421,14 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
|
||||
--bench hlapi-dex \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
|
||||
bench_hlapi_erc20_hpu: install_rs_check_toolchain
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
|
||||
|
||||
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
|
||||
bench_tfhe_zk_pok: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
@@ -1423,6 +1528,9 @@ tfhe_lints
|
||||
pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
|
||||
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
|
||||
|
||||
.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
|
||||
pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast
|
||||
|
||||
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
|
||||
fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
|
||||
check_md_docs_are_tested clippy_fast check_compile_tests
|
||||
|
||||
@@ -11,11 +11,13 @@ extend-ignore-identifiers-re = [
|
||||
# Example with string replacing "hello" with "herlo"
|
||||
"herlo",
|
||||
# Example in trivium
|
||||
"C9217BA0D762ACA1"
|
||||
"C9217BA0D762ACA1",
|
||||
"0x[0-9a-fA-F]+"
|
||||
]
|
||||
|
||||
[files]
|
||||
extend-exclude = [
|
||||
"backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
|
||||
"backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
|
||||
"backends/tfhe-hpu-backend/config_store/**/*.link_summary",
|
||||
]
|
||||
|
||||
@@ -28,9 +28,10 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
|
||||
|
||||
void cuda_improve_noise_modulus_switch_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
|
||||
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
|
||||
double r_sigma, double bound, uint32_t log_modulus);
|
||||
void const *lwe_array_in, void const *lwe_array_indexes,
|
||||
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
|
||||
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
|
||||
uint32_t log_modulus);
|
||||
|
||||
void cuda_glwe_sample_extract_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
|
||||
@@ -400,7 +400,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, bool reduce_degrees_for_single_carry_propagation,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
@@ -538,5 +539,13 @@ void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void extend_radix_with_trivial_zero_blocks_msb_64(
|
||||
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
|
||||
void *const *streams, uint32_t const *gpu_indexes);
|
||||
|
||||
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
void *const *streams,
|
||||
uint32_t const *gpu_indexes);
|
||||
} // extern C
|
||||
#endif // CUDA_INTEGER_H
|
||||
|
||||
@@ -1116,18 +1116,116 @@ template <typename Torus> struct int_overflowing_sub_memory {
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
CudaRadixCiphertextFFI *new_blocks;
|
||||
CudaRadixCiphertextFFI *new_blocks_copy;
|
||||
CudaRadixCiphertextFFI *old_blocks;
|
||||
CudaRadixCiphertextFFI *small_lwe_vector;
|
||||
|
||||
int_radix_params params;
|
||||
|
||||
int32_t *d_smart_copy_in;
|
||||
int32_t *d_smart_copy_out;
|
||||
|
||||
bool mem_reuse = false;
|
||||
size_t max_total_blocks_in_vec;
|
||||
uint32_t num_blocks_in_radix;
|
||||
uint32_t max_num_radix_in_vec;
|
||||
uint64_t *size_tracker;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
// temporary buffers
|
||||
CudaRadixCiphertextFFI *current_blocks;
|
||||
CudaRadixCiphertextFFI *small_lwe_vector;
|
||||
|
||||
uint32_t *d_columns_data;
|
||||
uint32_t *d_columns_counter;
|
||||
uint32_t **d_columns;
|
||||
|
||||
uint32_t *d_new_columns_data;
|
||||
uint32_t *d_new_columns_counter;
|
||||
uint32_t **d_new_columns;
|
||||
|
||||
uint64_t *d_degrees;
|
||||
uint32_t *d_pbs_counters;
|
||||
|
||||
// lookup table for extracting message and carry
|
||||
int_radix_lut<Torus> *luts_message_carry;
|
||||
|
||||
bool mem_reuse = false;
|
||||
|
||||
void setup_index_buffers(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes) {
|
||||
|
||||
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, gpu_memory_allocated);
|
||||
|
||||
d_pbs_counters = (uint32_t *)cuda_malloc_with_size_tracking_async(
|
||||
3 * sizeof(uint32_t), streams[0], gpu_indexes[0], size_tracker,
|
||||
gpu_memory_allocated);
|
||||
|
||||
auto num_blocks_in_radix = this->num_blocks_in_radix;
|
||||
auto max_num_radix_in_vec = this->max_num_radix_in_vec;
|
||||
auto setup_columns =
|
||||
[num_blocks_in_radix, max_num_radix_in_vec, streams,
|
||||
gpu_indexes](uint32_t **&columns, uint32_t *&columns_data,
|
||||
uint32_t *&columns_counter, uint64_t *size_tracker,
|
||||
bool gpu_memory_allocated) {
|
||||
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
|
||||
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
|
||||
streams[0], gpu_indexes[0], size_tracker, gpu_memory_allocated);
|
||||
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
|
||||
num_blocks_in_radix * sizeof(uint32_t), streams[0],
|
||||
gpu_indexes[0], size_tracker, gpu_memory_allocated);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
|
||||
streams[0], gpu_indexes[0], gpu_memory_allocated);
|
||||
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
|
||||
for (int i = 0; i < num_blocks_in_radix; ++i) {
|
||||
h_columns[i] = columns_data + i * max_num_radix_in_vec;
|
||||
}
|
||||
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
|
||||
num_blocks_in_radix * sizeof(uint32_t *), streams[0],
|
||||
gpu_indexes[0], size_tracker, gpu_memory_allocated);
|
||||
cuda_memcpy_async_to_gpu(columns, h_columns,
|
||||
num_blocks_in_radix * sizeof(uint32_t *),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
delete[] h_columns;
|
||||
};
|
||||
|
||||
setup_columns(d_columns, d_columns_data, d_columns_counter, size_tracker,
|
||||
gpu_memory_allocated);
|
||||
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter,
|
||||
size_tracker, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
void setup_lookup_tables(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count) {
|
||||
uint32_t message_modulus = params.message_modulus;
|
||||
|
||||
if (!mem_reuse) {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 2, max_total_blocks_in_vec,
|
||||
gpu_memory_allocated, size_tracker);
|
||||
}
|
||||
auto message_acc = luts_message_carry->get_lut(0, 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(0, 1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], message_acc,
|
||||
luts_message_carry->get_degree(0),
|
||||
luts_message_carry->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_message, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], carry_acc,
|
||||
luts_message_carry->get_degree(1),
|
||||
luts_message_carry->get_max_degree(1), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_carry, gpu_memory_allocated);
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
}
|
||||
int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
@@ -1136,103 +1234,84 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
bool allocate_gpu_memory,
|
||||
uint64_t *size_tracker) {
|
||||
this->params = params;
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->mem_reuse = false;
|
||||
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
this->num_blocks_in_radix = num_blocks_in_radix;
|
||||
this->max_num_radix_in_vec = max_num_radix_in_vec;
|
||||
this->gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->size_tracker = size_tracker;
|
||||
|
||||
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
setup_index_buffers(streams, gpu_indexes);
|
||||
setup_lookup_tables(streams, gpu_indexes, gpu_count);
|
||||
|
||||
// allocate gpu memory for intermediate buffers
|
||||
new_blocks = new CudaRadixCiphertextFFI;
|
||||
// create and allocate intermediate buffers
|
||||
current_blocks = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
new_blocks_copy = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
old_blocks = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
|
||||
streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
small_lwe_vector = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
|
||||
streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
|
||||
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, allocate_gpu_memory);
|
||||
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, allocate_gpu_memory);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
}
|
||||
|
||||
int_sum_ciphertexts_vec_memory(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
|
||||
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
|
||||
CudaRadixCiphertextFFI *old_blocks,
|
||||
CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
|
||||
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *current_blocks,
|
||||
CudaRadixCiphertextFFI *small_lwe_vector,
|
||||
int_radix_lut<Torus> *reused_lut, bool allocate_gpu_memory,
|
||||
uint64_t *size_tracker) {
|
||||
mem_reuse = true;
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->mem_reuse = true;
|
||||
this->params = params;
|
||||
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
this->num_blocks_in_radix = num_blocks_in_radix;
|
||||
this->max_num_radix_in_vec = max_num_radix_in_vec;
|
||||
this->gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->size_tracker = size_tracker;
|
||||
|
||||
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
|
||||
// assign gpu memory for intermediate buffers
|
||||
this->new_blocks = new_blocks;
|
||||
this->old_blocks = old_blocks;
|
||||
this->current_blocks = current_blocks;
|
||||
this->small_lwe_vector = small_lwe_vector;
|
||||
new_blocks_copy = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, allocate_gpu_memory);
|
||||
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, allocate_gpu_memory);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
this->luts_message_carry = reused_lut;
|
||||
setup_index_buffers(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
|
||||
cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_pbs_counters, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
|
||||
|
||||
cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
|
||||
gpu_memory_allocated);
|
||||
|
||||
cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
|
||||
if (!mem_reuse) {
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
small_lwe_vector, gpu_memory_allocated);
|
||||
luts_message_carry->release(streams, gpu_indexes, gpu_count);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
delete new_blocks;
|
||||
delete old_blocks;
|
||||
|
||||
delete current_blocks;
|
||||
delete small_lwe_vector;
|
||||
delete luts_message_carry;
|
||||
}
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
|
||||
gpu_memory_allocated);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
delete new_blocks_copy;
|
||||
}
|
||||
};
|
||||
|
||||
// For sequential algorithm in group propagation
|
||||
template <typename Torus> struct int_seq_group_prop_memory {
|
||||
|
||||
@@ -2549,7 +2628,7 @@ template <typename Torus> struct int_mul_memory {
|
||||
// radix_lwe_left except the last blocks of each shift
|
||||
int msb_vector_block_count = num_radix_blocks * (num_radix_blocks - 1) / 2;
|
||||
|
||||
int total_block_count = lsb_vector_block_count + msb_vector_block_count;
|
||||
int total_block_count = num_radix_blocks * num_radix_blocks;
|
||||
|
||||
// allocate memory for intermediate buffers
|
||||
vector_result_sb = new CudaRadixCiphertextFFI;
|
||||
@@ -2562,13 +2641,13 @@ template <typename Torus> struct int_mul_memory {
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
small_lwe_vector = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], small_lwe_vector, total_block_count,
|
||||
streams[0], gpu_indexes[0], small_lwe_vector, 2 * total_block_count,
|
||||
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
// create int_radix_lut objects for lsb, msb, message, carry
|
||||
// luts_array -> lut = {lsb_acc, msb_acc}
|
||||
luts_array = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
|
||||
params, 2, total_block_count,
|
||||
params, 2, 2 * total_block_count,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
auto lsb_acc = luts_array->get_lut(0, 0);
|
||||
auto msb_acc = luts_array->get_lut(0, 1);
|
||||
@@ -2602,9 +2681,10 @@ template <typename Torus> struct int_mul_memory {
|
||||
|
||||
luts_array->broadcast_lut(streams, gpu_indexes, 0);
|
||||
// create memory object for sum ciphertexts
|
||||
// create memory object for sum ciphertexts
|
||||
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
|
||||
2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
|
||||
2 * num_radix_blocks, vector_result_sb, small_lwe_vector, luts_array,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
uint32_t uses_carry = 0;
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
@@ -3918,7 +3998,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
zero_out_if_overflow_did_not_happen[0]->get_degree(0),
|
||||
zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, cur_lut_f, 2, gpu_memory_allocated);
|
||||
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
|
||||
gpu_memory_allocated);
|
||||
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
|
||||
0);
|
||||
generate_device_accumulator_bivariate_with_factor<Torus>(
|
||||
@@ -3927,7 +4008,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
zero_out_if_overflow_did_not_happen[1]->get_degree(0),
|
||||
zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, cur_lut_f, 3, gpu_memory_allocated);
|
||||
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
|
||||
gpu_memory_allocated);
|
||||
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
|
||||
0);
|
||||
|
||||
@@ -3954,7 +4036,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
zero_out_if_overflow_happened[0]->get_degree(0),
|
||||
zero_out_if_overflow_happened[0]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, overflow_happened_f, 2, gpu_memory_allocated);
|
||||
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
|
||||
gpu_memory_allocated);
|
||||
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes, 0);
|
||||
generate_device_accumulator_bivariate_with_factor<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
@@ -3962,7 +4045,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
zero_out_if_overflow_happened[1]->get_degree(0),
|
||||
zero_out_if_overflow_happened[1]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, overflow_happened_f, 3, gpu_memory_allocated);
|
||||
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
|
||||
gpu_memory_allocated);
|
||||
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes, 0);
|
||||
|
||||
// merge_overflow_flags_luts
|
||||
|
||||
@@ -248,6 +248,7 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
__uint128_t *global_accumulator;
|
||||
double *global_join_buffer;
|
||||
__uint128_t *temp_lwe_array_in;
|
||||
uint64_t *trivial_indexes;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
bool uses_noise_reduction;
|
||||
@@ -263,11 +264,27 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
cuda_set_device(gpu_index);
|
||||
this->pbs_variant = pbs_variant;
|
||||
this->uses_noise_reduction = allocate_ms_array;
|
||||
this->temp_lwe_array_in =
|
||||
(__uint128_t *)cuda_malloc_with_size_tracking_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(__uint128_t),
|
||||
stream, gpu_index, size_tracker, allocate_ms_array);
|
||||
if (allocate_ms_array) {
|
||||
this->temp_lwe_array_in =
|
||||
(__uint128_t *)cuda_malloc_with_size_tracking_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(__uint128_t),
|
||||
stream, gpu_index, size_tracker, allocate_ms_array);
|
||||
this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
|
||||
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
|
||||
size_tracker, allocate_ms_array);
|
||||
uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
|
||||
for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
|
||||
h_trivial_indexes[i] = i;
|
||||
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
trivial_indexes, h_trivial_indexes,
|
||||
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
|
||||
allocate_gpu_memory);
|
||||
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
delete[] h_trivial_indexes;
|
||||
}
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count *
|
||||
@@ -404,9 +421,12 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
|
||||
if (uses_noise_reduction)
|
||||
if (uses_noise_reduction) {
|
||||
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -502,7 +522,8 @@ template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count);
|
||||
uint32_t level_count,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
|
||||
@@ -86,13 +86,15 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
|
||||
|
||||
void cuda_improve_noise_modulus_switch_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
|
||||
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
|
||||
double r_sigma, double bound, uint32_t log_modulus) {
|
||||
void const *lwe_array_in, void const *lwe_array_indexes,
|
||||
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
|
||||
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
|
||||
uint32_t log_modulus) {
|
||||
host_improve_noise_modulus_switch<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(lwe_array_indexes),
|
||||
static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
|
||||
num_zeros, input_variance, r_sigma, bound, log_modulus);
|
||||
}
|
||||
|
||||
@@ -178,11 +178,10 @@ __device__ __forceinline__ double measure_modulus_switch_noise(
|
||||
|
||||
// Each thread processes two elements of the lwe array
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
const Torus *zeros, int lwe_size, int num_zeros,
|
||||
double input_variance, double r_sigma,
|
||||
double bound, uint32_t log_modulus) {
|
||||
__global__ void improve_noise_modulus_switch(
|
||||
Torus *array_out, const Torus *array_in, const uint64_t *indexes,
|
||||
const Torus *zeros, int lwe_size, int num_zeros, double input_variance,
|
||||
double r_sigma, double bound, uint32_t log_modulus) {
|
||||
|
||||
// First we will assume size is less than the number of threads per block
|
||||
// I should switch this to dynamic shared memory
|
||||
@@ -198,13 +197,13 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
// This probably are not needed cause we are setting the values
|
||||
sum_mask_errors[threadIdx.x] = 0.f;
|
||||
sum_squared_mask_errors[threadIdx.x] = 0.f;
|
||||
auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
|
||||
auto this_block_lwe_out = array_out + indexes[blockIdx.x] * lwe_size;
|
||||
Torus input_element1 = this_block_lwe_in[threadIdx.x];
|
||||
|
||||
Torus input_element1 = array_in[threadIdx.x + blockIdx.x * lwe_size];
|
||||
|
||||
Torus input_element2 =
|
||||
threadIdx.x + blockDim.x < lwe_size
|
||||
? array_in[threadIdx.x + blockDim.x + blockIdx.x * lwe_size]
|
||||
: 0;
|
||||
Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
|
||||
? this_block_lwe_in[threadIdx.x + blockDim.x]
|
||||
: 0;
|
||||
|
||||
// Base noise is only handled by thread 0
|
||||
double base_noise = measure_modulus_switch_noise<Torus>(
|
||||
@@ -218,11 +217,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
__syncthreads();
|
||||
|
||||
if (found)
|
||||
array_out[threadIdx.x + blockIdx.x * lwe_size] = input_element1;
|
||||
this_block_lwe_out[threadIdx.x] = input_element1;
|
||||
|
||||
if (found && (threadIdx.x + blockDim.x) < lwe_size)
|
||||
array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
|
||||
input_element2;
|
||||
this_block_lwe_out[threadIdx.x + blockDim.x] = input_element2;
|
||||
|
||||
__syncthreads();
|
||||
// If we found a zero element we stop iterating (in avg 20 times are
|
||||
@@ -253,11 +251,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
// Assumption we always have at least 512 elements
|
||||
// If we find a useful zero encryption we replace the lwe by lwe + zero
|
||||
if (found)
|
||||
array_out[threadIdx.x + blockIdx.x * lwe_size] = zero_element1;
|
||||
this_block_lwe_out[threadIdx.x] = zero_element1;
|
||||
|
||||
if (found && (threadIdx.x + blockDim.x) < lwe_size)
|
||||
array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
|
||||
zero_element2;
|
||||
this_block_lwe_out[threadIdx.x + blockDim.x] = zero_element2;
|
||||
|
||||
__syncthreads();
|
||||
// If we found a zero element we stop iterating (in avg 20 times are
|
||||
@@ -270,9 +267,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
template <typename Torus>
|
||||
__host__ void host_improve_noise_modulus_switch(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
|
||||
Torus const *array_in, const Torus *zeros, uint32_t lwe_size,
|
||||
uint32_t num_lwes, const uint32_t num_zeros, const double input_variance,
|
||||
const double r_sigma, const double bound, uint32_t log_modulus) {
|
||||
Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
|
||||
uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
|
||||
const double input_variance, const double r_sigma, const double bound,
|
||||
uint32_t log_modulus) {
|
||||
|
||||
if (lwe_size < 512) {
|
||||
PANIC("The lwe_size is less than 512, this is not supported\n");
|
||||
@@ -289,8 +287,8 @@ __host__ void host_improve_noise_modulus_switch(
|
||||
int num_threads = 512, num_blocks = num_lwes;
|
||||
|
||||
improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
|
||||
array_out, array_in, zeros, lwe_size, num_zeros, input_variance, r_sigma,
|
||||
bound, log_modulus);
|
||||
array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
|
||||
r_sigma, bound, log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
|
||||
@@ -492,6 +492,7 @@ __host__ void host_fourier_transform_forward_as_integer_f128(
|
||||
batch_convert_u128_to_f128_as_integer<params>
|
||||
<<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
|
||||
d_standard);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// call negacyclic 128 bit forward fft.
|
||||
if (full_sm) {
|
||||
@@ -503,6 +504,7 @@ __host__ void host_fourier_transform_forward_as_integer_f128(
|
||||
<<<grid_size, block_size, shared_memory_size, stream>>>(
|
||||
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
|
||||
@@ -63,7 +63,7 @@ void update_degrees_after_bitor(uint64_t *output_degrees,
|
||||
auto result = max;
|
||||
|
||||
for (uint j = 0; j < min + 1; j++) {
|
||||
if (max | j > result) {
|
||||
if ((max | j) > result) {
|
||||
result = max | j;
|
||||
}
|
||||
}
|
||||
@@ -82,7 +82,7 @@ void update_degrees_after_bitxor(uint64_t *output_degrees,
|
||||
|
||||
// Try every possibility to find the worst case
|
||||
for (uint j = 0; j < min + 1; j++) {
|
||||
if (max ^ j > result) {
|
||||
if ((max ^ j) > result) {
|
||||
result = max ^ j;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ __host__ void host_integer_radix_bitop_kb(
|
||||
update_degrees_after_bitor(degrees, lwe_array_1->degrees,
|
||||
lwe_array_2->degrees,
|
||||
lwe_array_1->num_radix_blocks);
|
||||
} else if (mem_ptr->op == BITXOR) {
|
||||
} else if (mem_ptr->op == BITOP_TYPE::BITXOR) {
|
||||
update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
|
||||
lwe_array_2->degrees,
|
||||
lwe_array_1->num_radix_blocks);
|
||||
|
||||
17
backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
Normal file
17
backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
Normal file
@@ -0,0 +1,17 @@
|
||||
#include "cast.cuh"
|
||||
|
||||
void extend_radix_with_trivial_zero_blocks_msb_64(
|
||||
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
|
||||
void *const *streams, uint32_t const *gpu_indexes) {
|
||||
host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(
|
||||
output, input, (cudaStream_t *)streams, gpu_indexes);
|
||||
}
|
||||
|
||||
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
void *const *streams,
|
||||
uint32_t const *gpu_indexes) {
|
||||
|
||||
host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
|
||||
gpu_indexes);
|
||||
}
|
||||
36
backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
Normal file
36
backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
Normal file
@@ -0,0 +1,36 @@
|
||||
#ifndef CAST_CUH
|
||||
#define CAST_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include "integer.cuh"
|
||||
#include "integer/integer_utilities.h"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
|
||||
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes) {
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
|
||||
0, input->num_radix_blocks, input, 0,
|
||||
input->num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes) {
|
||||
|
||||
const uint32_t input_start_lwe_index =
|
||||
input->num_radix_blocks - output->num_radix_blocks;
|
||||
|
||||
if (input->num_radix_blocks <= output->num_radix_blocks) {
|
||||
PANIC("Cuda error: input num blocks should be greater than output num "
|
||||
"blocks");
|
||||
}
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
|
||||
input_start_lwe_index, input->num_radix_blocks);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -456,7 +456,7 @@ __host__ void tree_sign_reduction(
|
||||
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
|
||||
while (partial_block_count > 2) {
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
|
||||
4);
|
||||
message_modulus);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
|
||||
@@ -477,16 +477,17 @@ __host__ void tree_sign_reduction(
|
||||
auto last_lut = tree_buffer->tree_last_leaf_lut;
|
||||
auto block_selector_f = tree_buffer->block_selector_f;
|
||||
std::function<Torus(Torus)> f;
|
||||
|
||||
auto num_bits_in_message = log2_int(params.message_modulus);
|
||||
if (partial_block_count == 2) {
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
|
||||
4);
|
||||
message_modulus);
|
||||
|
||||
f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
|
||||
int msb = (x >> 2) & 3;
|
||||
int lsb = x & 3;
|
||||
f = [block_selector_f, sign_handler_f, num_bits_in_message,
|
||||
message_modulus](Torus x) -> Torus {
|
||||
Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
|
||||
Torus lsb = x & (message_modulus - 1);
|
||||
|
||||
int final_sign = block_selector_f(msb, lsb);
|
||||
Torus final_sign = block_selector_f(msb, lsb);
|
||||
return sign_handler_f(final_sign);
|
||||
};
|
||||
} else {
|
||||
|
||||
@@ -386,8 +386,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
subtraction_overflowed,
|
||||
at_least_one_upper_block_is_non_zero, 1);
|
||||
|
||||
int factor = (i) ? 3 : 2;
|
||||
int factor_lut_id = factor - 2;
|
||||
auto message_modulus = radix_params.message_modulus;
|
||||
int factor = (i) ? message_modulus - 1 : message_modulus - 2;
|
||||
int factor_lut_id = (i) ? 1 : 0;
|
||||
for (size_t k = 0;
|
||||
k < cleaned_merged_interesting_remainder->num_radix_blocks; k++) {
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
|
||||
@@ -520,8 +520,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
if (num_radix_blocks > lut->num_blocks)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of lut radix blocks")
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
|
||||
num_radix_blocks > lwe_array_in->num_radix_blocks)
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of input & output radix blocks")
|
||||
|
||||
@@ -1291,7 +1290,7 @@ void host_compute_prefix_sum_hillis_steele(
|
||||
}
|
||||
|
||||
// This function is used to perform step 2 of Thomas' new propagation algorithm
|
||||
// Consist three steps:
|
||||
// Consists of three steps:
|
||||
// - propagates the carry within each group with cheap LWE operations stored in
|
||||
// simulators
|
||||
// - calculates the propagation state of each group
|
||||
@@ -1616,10 +1615,12 @@ __host__ void reduce_signs(
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
auto num_bits_in_message = log2_int(message_modulus);
|
||||
std::function<Torus(Torus)> reduce_two_orderings_function =
|
||||
[diff_buffer, sign_handler_f](Torus x) -> Torus {
|
||||
int msb = (x >> 2) & 3;
|
||||
int lsb = x & 3;
|
||||
[diff_buffer, sign_handler_f, num_bits_in_message,
|
||||
message_modulus](Torus x) -> Torus {
|
||||
Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
|
||||
Torus lsb = x & (message_modulus - 1);
|
||||
|
||||
return diff_buffer->tree_buffer->block_selector_f(msb, lsb);
|
||||
};
|
||||
@@ -1640,7 +1641,7 @@ __host__ void reduce_signs(
|
||||
|
||||
while (num_sign_blocks > 2) {
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
|
||||
num_sign_blocks, 4);
|
||||
num_sign_blocks, message_modulus);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, num_sign_blocks / 2);
|
||||
@@ -1669,7 +1670,8 @@ __host__ void reduce_signs(
|
||||
message_modulus, carry_modulus, final_lut_f, true);
|
||||
lut->broadcast_lut(streams, gpu_indexes, 0);
|
||||
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a, 2, 4);
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
|
||||
num_sign_blocks, message_modulus);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, 1);
|
||||
@@ -1677,8 +1679,8 @@ __host__ void reduce_signs(
|
||||
} else {
|
||||
|
||||
std::function<Torus(Torus)> final_lut_f =
|
||||
[mem_ptr, sign_handler_f](Torus x) -> Torus {
|
||||
return sign_handler_f(x & 3);
|
||||
[mem_ptr, sign_handler_f, message_modulus](Torus x) -> Torus {
|
||||
return sign_handler_f(x & (message_modulus - 1));
|
||||
};
|
||||
|
||||
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
|
||||
|
||||
@@ -226,7 +226,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, bool reduce_degrees_for_single_carry_propagation,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
@@ -234,64 +235,59 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
|
||||
PANIC("Cuda error: input vector length should be a multiple of the "
|
||||
"output's number of radix blocks")
|
||||
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
|
||||
// the general case
|
||||
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
|
||||
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
|
||||
}
|
||||
switch (mem->params.polynomial_size) {
|
||||
case 512:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
|
||||
@@ -20,28 +20,11 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <queue>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
|
||||
int32_t *id_in, size_t lwe_size) {
|
||||
size_t tid = threadIdx.x;
|
||||
size_t b_id = blockIdx.x;
|
||||
size_t stride = blockDim.x;
|
||||
|
||||
auto input_id = id_in[b_id];
|
||||
auto output_id = id_out[b_id];
|
||||
|
||||
auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
|
||||
auto cur_dst = &dst[output_id * lwe_size];
|
||||
|
||||
for (int i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void
|
||||
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
|
||||
@@ -94,33 +77,173 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
uint32_t chunk_size, uint32_t block_size,
|
||||
uint32_t num_blocks) {
|
||||
__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
|
||||
uint32_t *const columns_counter,
|
||||
const uint64_t *const degrees,
|
||||
const uint32_t num_radix_blocks,
|
||||
const uint32_t num_radix_in_vec) {
|
||||
|
||||
size_t stride = blockDim.x;
|
||||
size_t chunk_id = blockIdx.x;
|
||||
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
|
||||
size_t radix_elem_size = num_blocks * block_size;
|
||||
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
|
||||
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
|
||||
size_t block_stride = blockIdx.y * block_size;
|
||||
auto result = &dst_radix[block_stride];
|
||||
|
||||
// init shared mem with first radix of chunk
|
||||
size_t tid = threadIdx.x;
|
||||
for (int i = tid; i < block_size; i += stride) {
|
||||
result[i] = src_chunk[block_stride + i];
|
||||
}
|
||||
|
||||
// accumulate rest of the radixes
|
||||
for (int r_id = 1; r_id < chunk_size; r_id++) {
|
||||
auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
|
||||
for (int i = tid; i < block_size; i += stride) {
|
||||
result[i] += cur_src_radix[block_stride + i];
|
||||
const uint32_t idx = threadIdx.x;
|
||||
size_t cnt = 0;
|
||||
for (int i = 0; i < num_radix_in_vec; i++) {
|
||||
size_t ct_id = i * num_radix_blocks + idx;
|
||||
if (degrees[ct_id] != 0) {
|
||||
columns[idx][cnt] = ct_id;
|
||||
++cnt;
|
||||
}
|
||||
}
|
||||
columns_counter[idx] = cnt;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ inline void prepare_new_columns_and_pbs_indexes(
|
||||
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
|
||||
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
|
||||
Torus *const lut_indexes, uint32_t *const pbs_counters,
|
||||
const uint32_t *const *const columns, const uint32_t *const columns_counter,
|
||||
const uint32_t chunk_size) {
|
||||
__shared__ uint32_t counter, sharedOr;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
counter = 0;
|
||||
sharedOr = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const uint32_t base_id = threadIdx.x;
|
||||
const uint32_t column_len = columns_counter[base_id];
|
||||
|
||||
uint32_t ct_count = 0;
|
||||
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
|
||||
// those indexes are for message ciphertexts
|
||||
// for message ciphertexts in and out index should be same
|
||||
const uint32_t in_index = columns[base_id][i];
|
||||
new_columns[base_id][ct_count] = in_index;
|
||||
const uint32_t pbs_index = atomicAdd(&counter, 1);
|
||||
pbs_indexes_in[pbs_index] = in_index;
|
||||
pbs_indexes_out[pbs_index] = in_index;
|
||||
lut_indexes[pbs_index] = 0;
|
||||
++ct_count;
|
||||
}
|
||||
// ct1 ct2 ct3
|
||||
// pbs_indexes: 0, 1, 2
|
||||
// pbs_indexes: 2, 1, 0
|
||||
|
||||
__syncthreads();
|
||||
uint32_t message_count = counter;
|
||||
|
||||
if (base_id > 0) {
|
||||
const uint32_t prev_base_id = base_id - 1;
|
||||
const uint32_t prev_column_len = columns_counter[prev_base_id];
|
||||
|
||||
for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
|
||||
// those indexes are for carry ciphertexts
|
||||
// for carry ciphertexts input is same as for message
|
||||
// output will be placed to next block in the column
|
||||
const uint32_t in_index = columns[prev_base_id][i];
|
||||
const uint32_t out_index = columns[prev_base_id][i + 1];
|
||||
new_columns[base_id][ct_count] = out_index;
|
||||
const uint32_t pbs_index = atomicAdd(&counter, 1);
|
||||
pbs_indexes_in[pbs_index] = in_index;
|
||||
pbs_indexes_out[pbs_index] = out_index;
|
||||
lut_indexes[pbs_index] = 1;
|
||||
++ct_count;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t start_index = column_len - column_len % chunk_size;
|
||||
for (uint32_t i = start_index; i < column_len; ++i) {
|
||||
new_columns[base_id][ct_count] = columns[base_id][i];
|
||||
++ct_count;
|
||||
}
|
||||
|
||||
new_columns_counter[base_id] = ct_count;
|
||||
|
||||
if (ct_count > chunk_size) {
|
||||
atomicOr(&sharedOr, 1);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
pbs_counters[0] = counter;
|
||||
pbs_counters[1] = message_count;
|
||||
pbs_counters[2] = sharedOr;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ inline void prepare_final_pbs_indexes(
|
||||
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
|
||||
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
|
||||
int idx = threadIdx.x;
|
||||
pbs_indexes_in[idx] = idx % num_radix_blocks;
|
||||
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
|
||||
lut_indexes[idx] = idx / num_radix_blocks;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void calculate_chunks(Torus *const input_blocks,
|
||||
const uint32_t *const *const columns,
|
||||
const uint32_t *const columns_counter,
|
||||
const uint32_t chunk_size,
|
||||
const uint32_t block_size) {
|
||||
|
||||
const uint32_t part_size = blockDim.x;
|
||||
const uint32_t base_id = blockIdx.x;
|
||||
const uint32_t part_id = blockIdx.y;
|
||||
const uint32_t coef_id = part_id * part_size + threadIdx.x;
|
||||
|
||||
if (coef_id >= block_size)
|
||||
return;
|
||||
|
||||
const uint32_t column_len = columns_counter[base_id];
|
||||
|
||||
if (column_len >= chunk_size) {
|
||||
const uint32_t num_chunks = column_len / chunk_size;
|
||||
Torus result = 0;
|
||||
|
||||
for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
|
||||
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
|
||||
result = input_blocks[first_ct_id * block_size + coef_id];
|
||||
|
||||
for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
|
||||
const uint32_t cur_ct_id =
|
||||
columns[base_id][chunk_id * chunk_size + ct_id];
|
||||
result += input_blocks[cur_ct_id * block_size + coef_id];
|
||||
}
|
||||
|
||||
input_blocks[first_ct_id * block_size + coef_id] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void calculate_final_chunk_into_radix(
|
||||
Torus *const out_radix, const Torus *const input_blocks,
|
||||
const uint32_t *const *const columns, const uint32_t *const columns_counter,
|
||||
const uint32_t chunk_size, const uint32_t block_size) {
|
||||
|
||||
const uint32_t part_size = blockDim.x;
|
||||
const uint32_t base_id = blockIdx.x;
|
||||
const uint32_t part_id = blockIdx.y;
|
||||
const uint32_t coef_id = part_id * part_size + threadIdx.x;
|
||||
|
||||
if (coef_id >= block_size)
|
||||
return;
|
||||
|
||||
const uint32_t column_len = columns_counter[base_id];
|
||||
|
||||
Torus result = 0;
|
||||
if (column_len) {
|
||||
const uint32_t first_ct_id = columns[base_id][0];
|
||||
result = input_blocks[first_ct_id * block_size + coef_id];
|
||||
|
||||
for (uint32_t i = 1; i < column_len; ++i) {
|
||||
const uint32_t cur_ct_it = columns[base_id][i];
|
||||
result += input_blocks[cur_ct_it * block_size + coef_id];
|
||||
}
|
||||
}
|
||||
out_radix[base_id * block_size + coef_id] = result;
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
@@ -167,6 +290,65 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
|
||||
(process_msb) ? cur_msb_ct[params::degree] : 0;
|
||||
}
|
||||
}
|
||||
|
||||
inline bool at_least_one_column_needs_processing(
|
||||
const uint64_t *const degrees, const uint32_t num_radix_blocks,
|
||||
const uint32_t num_radix_in_vec, const uint32_t chunk_size) {
|
||||
std::vector<uint32_t> columns_count(num_radix_blocks, 0);
|
||||
|
||||
for (size_t column = 0; column < num_radix_blocks; ++column) {
|
||||
for (size_t block = 0; block < num_radix_in_vec; ++block) {
|
||||
const size_t block_index = block * num_radix_blocks + column;
|
||||
if (degrees[block_index]) {
|
||||
columns_count[column]++;
|
||||
if (columns_count[column] > chunk_size) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
inline void calculate_final_degrees(uint64_t *const out_degrees,
|
||||
const uint64_t *const input_degrees,
|
||||
size_t num_blocks, size_t num_radix_in_vec,
|
||||
size_t chunk_size,
|
||||
uint64_t message_modulus) {
|
||||
|
||||
auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
|
||||
return std::min(message_modulus - 1, degree);
|
||||
};
|
||||
std::vector<std::queue<uint64_t>> columns(num_blocks);
|
||||
for (size_t i = 0; i < num_radix_in_vec; ++i) {
|
||||
for (size_t j = 0; j < num_blocks; ++j) {
|
||||
if (input_degrees[i * num_blocks + j])
|
||||
columns[j].push(input_degrees[i * num_blocks + j]);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < num_blocks; ++i) {
|
||||
auto &col = columns[i];
|
||||
while (col.size() > 1) {
|
||||
uint32_t cur_degree = 0;
|
||||
size_t mn = std::min(chunk_size, col.size());
|
||||
for (int j = 0; j < mn; ++j) {
|
||||
cur_degree += col.front();
|
||||
col.pop();
|
||||
}
|
||||
const uint64_t new_degree = get_degree(cur_degree);
|
||||
col.push(new_degree);
|
||||
if ((i + 1) < num_blocks) {
|
||||
columns[i + 1].push(new_degree);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_blocks; i++) {
|
||||
out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -181,15 +363,107 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
__global__ inline void DEBUG_PRINT_COLUMNS(uint32_t *const *const columns,
|
||||
uint32_t *const columns_counter,
|
||||
const uint32_t num_radix_blocks) {
|
||||
printf("cuda_columns_counter:\n");
|
||||
for (int i = 0; i < num_radix_blocks; i++) {
|
||||
printf("%d ", columns_counter[i]);
|
||||
}
|
||||
printf("\n");
|
||||
printf("cuda_columns:\n");
|
||||
|
||||
for (int i = 0; i < num_radix_blocks; i++) {
|
||||
printf("column[%d]: ", i);
|
||||
for (int j = 0; j < columns_counter[i]; j++)
|
||||
{
|
||||
printf("%d ", columns[i][j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
}
|
||||
|
||||
__global__ inline void DEBUG_PRINT_COLUMNS_DATA(uint32_t *const *const columns,
|
||||
uint32_t *const columns_counter,
|
||||
uint64_t* data,
|
||||
const uint32_t num_radix_blocks, size_t lwe_size) {
|
||||
|
||||
uint64_t delta = 576460752303423488ULL;
|
||||
__syncthreads();
|
||||
printf("cuda_new_columns:\n");
|
||||
__syncthreads();
|
||||
for (int i = 0; i < num_radix_blocks; i++) {
|
||||
__syncthreads();
|
||||
printf("column[%d]: ", i);
|
||||
__syncthreads();
|
||||
for (int j = 0; j < columns_counter[i]; j++)
|
||||
{
|
||||
__syncthreads();
|
||||
auto cur_data =data[ columns[i][j] * lwe_size + lwe_size - 1];
|
||||
cur_data /= delta;
|
||||
printf("%llu ", cur_data);
|
||||
__syncthreads();
|
||||
}
|
||||
__syncthreads();
|
||||
printf("\n");
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
printf("\n");
|
||||
__syncthreads();
|
||||
|
||||
}
|
||||
|
||||
template<typename Torus, bool input, bool clear>
|
||||
__global__ inline void DEBUG_PRINT_PBS_DATA(Torus * data, Torus* input_indexes, Torus*
|
||||
output_indexes, Torus *lut_indexes, size_t lwe_size, int num) {
|
||||
printf("input_output_indexes: \n");
|
||||
|
||||
for (int i = 0; i < num; i++) {
|
||||
auto input_val = data[input_indexes[i] * lwe_size + lwe_size -1];
|
||||
auto output_val = data[output_indexes[i] * lwe_size + lwe_size -1];
|
||||
|
||||
auto val = input ? input_val : output_val;
|
||||
auto val_clear = clear ? val / 576460752303423488ULL : val;
|
||||
|
||||
printf("%d %lu %lu %lu %lu %lu\n", i, input_indexes[i], output_indexes[i], lut_indexes[i],
|
||||
val_clear, val);
|
||||
}
|
||||
}
|
||||
|
||||
//template<typename Torus>
|
||||
//__global__ inline void DEBUG_PRINT_RADIX(Torus * data, size_t num_blocks, size_t lwe_size) {
|
||||
// for (int i = 0; i < num_blocks; i++) {
|
||||
// auto val = data[i * lwe_size + lwe_size - 1];
|
||||
// auto val_clear = val / 576460752303423488ULL;
|
||||
// printf("cuda_partial_sum_result: %lu %lu\n", val, val_clear);
|
||||
// }
|
||||
//}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
|
||||
CudaRadixCiphertextFFI *terms, bool reduce_degrees_for_single_carry_propagation, void *const
|
||||
*bsks, uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
|
||||
int_radix_lut<Torus> *reused_lut) {
|
||||
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
|
||||
// cudaDeviceSynchronize();
|
||||
// print_body<Torus>("cuda_input_partial_sum", (Torus*)terms->ptr, num_radix_blocks * num_radix_in_vec,
|
||||
// 2048,
|
||||
// 576460752303423488ULL);
|
||||
|
||||
// for (int i = 0; i <num_radix_blocks * num_radix_in_vec; i++ ) {
|
||||
// printf("cuda_input_degrees: %d\n", terms->degrees[i]);
|
||||
// }
|
||||
// cudaDeviceSynchronize();
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
|
||||
if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
|
||||
PANIC("Cuda error: output and input radix ciphertexts should have the same "
|
||||
@@ -199,22 +473,28 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
PANIC("Cuda error: input vector does not have enough blocks")
|
||||
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
|
||||
PANIC("Cuda error: output does not have enough blocks")
|
||||
auto new_blocks = mem_ptr->new_blocks;
|
||||
auto new_blocks_copy = mem_ptr->new_blocks_copy;
|
||||
auto old_blocks = mem_ptr->old_blocks;
|
||||
|
||||
auto current_blocks = mem_ptr->current_blocks;
|
||||
auto small_lwe_vector = mem_ptr->small_lwe_vector;
|
||||
auto d_degrees = mem_ptr->d_degrees;
|
||||
auto d_columns = mem_ptr->d_columns;
|
||||
auto d_columns_counter = mem_ptr->d_columns_counter;
|
||||
auto d_new_columns = mem_ptr->d_new_columns;
|
||||
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
|
||||
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
|
||||
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
|
||||
auto d_pbs_counters = mem_ptr->d_pbs_counters;
|
||||
|
||||
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
|
||||
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
|
||||
auto luts_message_carry = mem_ptr->luts_message_carry;
|
||||
|
||||
auto message_modulus = mem_ptr->params.message_modulus;
|
||||
auto carry_modulus = mem_ptr->params.carry_modulus;
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
auto glwe_dimension = mem_ptr->params.glwe_dimension;
|
||||
auto polynomial_size = mem_ptr->params.polynomial_size;
|
||||
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
|
||||
auto small_lwe_size = small_lwe_dimension + 1;
|
||||
auto chunk_size =
|
||||
(mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
|
||||
(mem_ptr->params.message_modulus - 1);
|
||||
|
||||
size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
|
||||
|
||||
// In the case of extracting a single LWE this parameters are dummy
|
||||
uint32_t num_many_lut = 1;
|
||||
@@ -228,244 +508,202 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
terms, 0, num_radix_blocks);
|
||||
return;
|
||||
}
|
||||
if (old_blocks != terms) {
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
|
||||
terms);
|
||||
}
|
||||
|
||||
if (num_radix_in_vec == 2) {
|
||||
CudaRadixCiphertextFFI old_blocks_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
|
||||
num_radix_blocks, 2 * num_radix_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
|
||||
&old_blocks_slice, num_radix_blocks);
|
||||
CudaRadixCiphertextFFI terms_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
|
||||
2 * num_radix_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
|
||||
&terms_slice, num_radix_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t r = num_radix_in_vec;
|
||||
size_t total_modulus = message_modulus * carry_modulus;
|
||||
size_t message_max = message_modulus - 1;
|
||||
size_t chunk_size = (total_modulus - 1) / message_max;
|
||||
|
||||
size_t h_lwe_idx_in[terms->num_radix_blocks];
|
||||
size_t h_lwe_idx_out[terms->num_radix_blocks];
|
||||
int32_t h_smart_copy_in[terms->num_radix_blocks];
|
||||
int32_t h_smart_copy_out[terms->num_radix_blocks];
|
||||
|
||||
/// Here it is important to query the default max shared memory on device 0
|
||||
/// instead of cuda_get_max_shared_memory,
|
||||
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
|
||||
auto max_shared_memory = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
|
||||
|
||||
// create lut object for message and carry
|
||||
// we allocate luts_message_carry in the host function (instead of scratch)
|
||||
// to reduce average memory consumption
|
||||
int_radix_lut<Torus> *luts_message_carry;
|
||||
size_t ch_amount = r / chunk_size;
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
if (reused_lut == nullptr) {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_radix_blocks, true, nullptr);
|
||||
} else {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
|
||||
if (current_blocks != terms) {
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
|
||||
current_blocks, terms);
|
||||
}
|
||||
auto message_acc = luts_message_carry->get_lut(0, 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(0, 1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
|
||||
total_blocks_in_vec * sizeof(uint64_t), streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], message_acc,
|
||||
luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
lut_f_message, true);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
|
||||
luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, lut_f_carry, true);
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
|
||||
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
|
||||
num_radix_in_vec);
|
||||
|
||||
while (r > 2) {
|
||||
size_t cur_total_blocks = r * num_radix_blocks;
|
||||
size_t ch_amount = r / chunk_size;
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
dim3 add_grid(ch_amount, num_radix_blocks, 1);
|
||||
bool needs_processing = at_least_one_column_needs_processing(
|
||||
current_blocks->degrees, num_radix_blocks, num_radix_in_vec, chunk_size);
|
||||
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
|
||||
(Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
|
||||
std::min(r, chunk_size), big_lwe_size, num_radix_blocks);
|
||||
int number_of_threads = min(256, params::degree);
|
||||
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
|
||||
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
// h_pbs_counters[0] - total ciphertexts
|
||||
// h_pbs_counters[1] - message ciphertexts
|
||||
// h_pbs_counters[2] - at_least_one_column_needs_processing
|
||||
uint32_t *h_pbs_counters;
|
||||
cudaMallocHost((void **)&h_pbs_counters, 3 * sizeof(uint32_t));
|
||||
if (mem_ptr->mem_reuse) {
|
||||
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
size_t total_count = 0;
|
||||
size_t message_count = 0;
|
||||
size_t carry_count = 0;
|
||||
size_t sm_copy_count = 0;
|
||||
|
||||
generate_ids_update_degrees(
|
||||
terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
|
||||
h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
|
||||
message_max, total_count, message_count, carry_count, sm_copy_count);
|
||||
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
|
||||
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
|
||||
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
|
||||
h_lwe_idx_in, h_lwe_idx_out);
|
||||
while (needs_processing) {
|
||||
// cudaDeviceSynchronize();
|
||||
// DEBUG_PRINT_COLUMNS<<<1, 1, 0, streams[0]>>>(d_columns, d_columns_counter, num_radix_blocks);
|
||||
// DEBUG_PRINT_COLUMNS_DATA<<<1, 1, 0, streams[0]>>>(d_columns, d_columns_counter, (uint64_t *)
|
||||
// (current_blocks->ptr), num_radix_blocks, big_lwe_size);
|
||||
calculate_chunks<Torus>
|
||||
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
|
||||
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
|
||||
chunk_size, big_lwe_size);
|
||||
|
||||
size_t copy_size = sm_copy_count * sizeof(int32_t);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
|
||||
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
|
||||
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
|
||||
d_pbs_counters, d_columns, d_columns_counter, chunk_size);
|
||||
|
||||
// inside d_smart_copy_in there are only -1 values
|
||||
// it's fine to call smart_copy with same pointer
|
||||
// as source and destination
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
|
||||
new_blocks, 0, r * num_radix_blocks);
|
||||
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
|
||||
(Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
|
||||
d_smart_copy_out, d_smart_copy_in, big_lwe_size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
cuda_memcpy_async_to_cpu(h_pbs_counters, d_pbs_counters,
|
||||
3 * sizeof(uint32_t), streams[0], gpu_indexes[0]);
|
||||
|
||||
if (carry_count > 0)
|
||||
cuda_set_value_async<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
luts_message_carry->get_lut_indexes(0, message_count), 1,
|
||||
carry_count);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
const uint32_t total_ciphertexts = h_pbs_counters[0];
|
||||
const uint32_t total_messages = h_pbs_counters[1];
|
||||
needs_processing = (h_pbs_counters[2] != 0);
|
||||
|
||||
auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
|
||||
|
||||
// DEBUG_PRINT_PBS_DATA<Torus, true, true><<<1, 1, 0, streams[0]>>>(
|
||||
// (Torus *)(current_blocks->ptr), d_pbs_indexes_in, d_pbs_indexes_out,
|
||||
// luts_message_carry->get_lut_indexes(0, 0), big_lwe_size, total_ciphertexts
|
||||
// );
|
||||
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
|
||||
std::vector<Torus *> small_lwe_vector_vec =
|
||||
luts_message_carry->lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec =
|
||||
luts_message_carry->lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec =
|
||||
luts_message_carry->lwe_trivial_indexes_vec;
|
||||
|
||||
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
|
||||
if (active_gpu_count == 1) {
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
|
||||
lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
|
||||
polynomial_size * glwe_dimension, small_lwe_dimension,
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
|
||||
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
|
||||
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
|
||||
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, new_blocks_vec,
|
||||
(Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, message_count,
|
||||
big_lwe_size);
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
|
||||
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
|
||||
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
|
||||
ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
|
||||
total_messages);
|
||||
|
||||
/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
|
||||
/// different configuration
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
|
||||
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, message_count,
|
||||
small_lwe_size);
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
|
||||
(Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, total_count,
|
||||
small_lwe_size);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
|
||||
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
|
||||
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
|
||||
d_pbs_indexes_out, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
|
||||
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
|
||||
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
} else {
|
||||
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->lwe_indexes_in,
|
||||
total_ciphertexts * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
|
||||
luts_message_carry->lwe_indexes_out,
|
||||
total_ciphertexts * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
|
||||
lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
|
||||
luts_message_carry->using_trivial_lwe_indexes, total_count,
|
||||
big_lwe_size);
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
for (uint i = 0; i < total_count; i++) {
|
||||
auto degrees_index = luts_message_carry->h_lut_indexes[i];
|
||||
new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
|
||||
new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
|
||||
luts_message_carry->using_trivial_lwe_indexes = false;
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, current_blocks,
|
||||
current_blocks, bsks, ksks, ms_noise_reduction_key,
|
||||
luts_message_carry, total_ciphertexts);
|
||||
}
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
// DEBUG_PRINT_PBS_DATA<Torus, false, true><<<1, 1, 0, streams[0]>>>(
|
||||
// (Torus *)(current_blocks->ptr), d_pbs_indexes_in, d_pbs_indexes_out,
|
||||
// luts_message_carry->get_lut_indexes(0, 0), big_lwe_size, total_ciphertexts
|
||||
// );
|
||||
|
||||
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
|
||||
int new_blocks_created = 2 * ch_amount * num_radix_blocks;
|
||||
|
||||
if (rem_blocks > 0)
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
|
||||
new_blocks_created + rem_blocks, old_blocks,
|
||||
cur_total_blocks - rem_blocks, cur_total_blocks);
|
||||
std::swap(new_blocks, old_blocks);
|
||||
r = (new_blocks_created + rem_blocks) / num_radix_blocks;
|
||||
std::swap(d_columns, d_new_columns);
|
||||
std::swap(d_columns_counter, d_new_columns_counter);
|
||||
}
|
||||
luts_message_carry->release(streams, gpu_indexes, gpu_count);
|
||||
delete (luts_message_carry);
|
||||
|
||||
CudaRadixCiphertextFFI old_blocks_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
|
||||
num_radix_blocks, 2 * num_radix_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
|
||||
&old_blocks_slice, num_radix_blocks);
|
||||
cudaFreeHost(h_pbs_counters);
|
||||
calculate_final_chunk_into_radix<Torus>
|
||||
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
|
||||
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
|
||||
d_columns, d_columns_counter, chunk_size, big_lwe_size);
|
||||
|
||||
if (reduce_degrees_for_single_carry_propagation) {
|
||||
prepare_final_pbs_indexes<Torus><<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
|
||||
d_pbs_indexes_in, d_pbs_indexes_out,
|
||||
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
|
||||
|
||||
cuda_memset_async(
|
||||
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
|
||||
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
|
||||
auto active_gpu_count = get_active_gpu_count(2 * num_radix_blocks, gpu_count);
|
||||
|
||||
if (active_gpu_count == 1) {
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
|
||||
mem_ptr->params.ks_level, num_radix_blocks);
|
||||
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
|
||||
d_pbs_indexes_out, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
|
||||
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
} else {
|
||||
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->lwe_indexes_in,
|
||||
2 * num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
|
||||
luts_message_carry->lwe_indexes_out,
|
||||
2 * num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
luts_message_carry->using_trivial_lwe_indexes = false;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
|
||||
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
|
||||
2 * num_radix_blocks);
|
||||
}
|
||||
// cudaDeviceSynchronize();
|
||||
// print_body<Torus>("cuda_before_add", (Torus*)radix_lwe_out->ptr, num_radix_blocks, 2048,
|
||||
// 576460752303423488ULL);
|
||||
// cudaDeviceSynchronize();
|
||||
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
|
||||
num_radix_blocks, num_radix_in_vec, chunk_size,
|
||||
mem_ptr->params.message_modulus);
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
CudaRadixCiphertextFFI current_blocks_slice;
|
||||
as_radix_ciphertext_slice<Torus>(¤t_blocks_slice, current_blocks,
|
||||
num_radix_blocks, 2 * num_radix_blocks);
|
||||
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
|
||||
current_blocks, ¤t_blocks_slice, num_radix_blocks);
|
||||
// printf("add_happened\n");
|
||||
}
|
||||
|
||||
|
||||
// cudaDeviceSynchronize();
|
||||
//
|
||||
// print_body<Torus>("cuda_out_after_add", (Torus*)radix_lwe_out->ptr, num_radix_blocks, 2048,
|
||||
// 576460752303423488ULL);
|
||||
// cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
@@ -599,14 +837,28 @@ __host__ void host_integer_mult_radix_kb(
|
||||
size_t b_id = i % num_blocks;
|
||||
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < num_blocks * 2 * num_blocks; i++)
|
||||
{
|
||||
auto cur_ptr = (Torus*)vector_result_sb->ptr;
|
||||
cur_ptr += i * 2049 + 2048;
|
||||
print_debug<Torus>("", cur_ptr, 1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_blocks * 2 * num_blocks; i++) {
|
||||
printf("%d\n", vector_result_sb->degrees[i]);
|
||||
}
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, true, bsks,
|
||||
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
|
||||
2 * num_blocks, mem_ptr->luts_array);
|
||||
2 * num_blocks);
|
||||
|
||||
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
uint32_t uses_carry = 0;
|
||||
|
||||
|
||||
host_propagate_single_carry<Torus>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
|
||||
scp_mem_ptr, bsks, ksks, ms_noise_reduction_key, requested_flag,
|
||||
|
||||
@@ -115,13 +115,10 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
|
||||
lwe_array, 0, num_radix_blocks);
|
||||
} else {
|
||||
for (int i = 0; i < j * num_radix_blocks; i++) {
|
||||
all_shifted_buffer->degrees[i] = message_modulus - 1;
|
||||
}
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
|
||||
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, true, bsks,
|
||||
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
|
||||
num_radix_blocks, j, nullptr);
|
||||
num_radix_blocks, j);
|
||||
|
||||
auto scp_mem_ptr = mem->sc_prop_mem;
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
|
||||
@@ -261,6 +261,8 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
|
||||
default:
|
||||
break;
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
|
||||
@@ -279,6 +279,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
PANIC("Cuda error (convert KSK): unsupported polynomial size. Supported "
|
||||
"N's are powers of two in the interval [256..16384].")
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_async(d_bsk, stream, gpu_index);
|
||||
cuda_drop_async(buffer, stream, gpu_index);
|
||||
@@ -315,6 +316,7 @@ void convert_u128_to_f128_and_forward_fft_128(cudaStream_t stream,
|
||||
// convert u128 into 4 x double
|
||||
batch_convert_u128_to_f128_strided_as_torus<params>
|
||||
<<<grid_size, block_size, 0, stream>>>(d_bsk, d_standard);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// call negacyclic 128 bit forward fft.
|
||||
if (full_sm) {
|
||||
@@ -326,6 +328,7 @@ void convert_u128_to_f128_and_forward_fft_128(cudaStream_t stream,
|
||||
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
|
||||
buffer);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
cuda_drop_async(buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
|
||||
@@ -194,7 +194,8 @@ void execute_pbs_async(
|
||||
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
|
||||
|
||||
void *zeros = nullptr;
|
||||
if (ms_noise_reduction_key != nullptr)
|
||||
if (ms_noise_reduction_key != nullptr &&
|
||||
ms_noise_reduction_key->ptr != nullptr)
|
||||
zeros = ms_noise_reduction_key->ptr[i];
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
|
||||
@@ -660,13 +660,15 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
// If the parameters contain noise reduction key, then apply it
|
||||
if (ms_noise_reduction_key != nullptr) {
|
||||
if (ms_noise_reduction_key != nullptr &&
|
||||
ms_noise_reduction_key->ptr != nullptr) {
|
||||
if (ms_noise_reduction_key->num_zeros != 0) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_improve_noise_modulus_switch<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
buffer->temp_lwe_array_in,
|
||||
static_cast<uint64_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
|
||||
num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
@@ -846,4 +848,7 @@ template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
template bool
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
__uint128_t>(uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
#endif
|
||||
|
||||
@@ -256,6 +256,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<__uint128_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(buffer->trivial_indexes),
|
||||
static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
|
||||
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
|
||||
@@ -398,20 +398,32 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
bool supports_cg =
|
||||
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
// On H100s we should be using TBC until num_samples < num_sms / 2.
|
||||
// After that we switch to CG until not supported anymore.
|
||||
// At this point we return to TBC.
|
||||
int num_sms = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
|
||||
|
||||
bool supports_tbc =
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, cuda_get_max_shared_memory(gpu_index)))
|
||||
level_count, cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
if (supports_tbc &&
|
||||
!(input_lwe_ciphertext_count > num_sms / 2 && supports_cg))
|
||||
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
uint64_t>(glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(gpu_index)))
|
||||
if (supports_cg)
|
||||
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
|
||||
@@ -37,18 +37,21 @@ template <typename T> void print_debug(const char *name, const T *src, int N) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
|
||||
__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
|
||||
for (int i = 0; i < N; i++) {
|
||||
printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
|
||||
T body = src[i * (lwe_dimension + 1) + lwe_dimension];
|
||||
T clear = body / delta;
|
||||
printf("(%lu, %lu), ", body, clear);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void print_body(const char *name, T *src, int n, int lwe_dimension) {
|
||||
void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
|
||||
printf("%s: ", name);
|
||||
cudaDeviceSynchronize();
|
||||
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
|
||||
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
|
||||
cudaDeviceSynchronize();
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@@ -168,7 +168,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
pbs_level)) {
|
||||
pbs_level, cuda_get_max_shared_memory(0))) {
|
||||
st.SkipWithError("Configuration not supported for tbc operation");
|
||||
return;
|
||||
}
|
||||
@@ -256,7 +256,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
pbs_level)) {
|
||||
pbs_level, cuda_get_max_shared_memory(0))) {
|
||||
st.SkipWithError("Configuration not supported for tbc operation");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -50,6 +50,7 @@ unsafe extern "C" {
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
lwe_array_indexes: *const ffi::c_void,
|
||||
encrypted_zeros: *const ffi::c_void,
|
||||
lwe_size: u32,
|
||||
num_lwes: u32,
|
||||
@@ -1017,6 +1018,7 @@ unsafe extern "C" {
|
||||
gpu_count: u32,
|
||||
radix_lwe_out: *mut CudaRadixCiphertextFFI,
|
||||
radix_lwe_vec: *mut CudaRadixCiphertextFFI,
|
||||
reduce_degrees_for_single_carry_propagation: bool,
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
@@ -1315,6 +1317,22 @@ unsafe extern "C" {
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn trim_radix_blocks_lsb_64(
|
||||
output: *mut CudaRadixCiphertextFFI,
|
||||
input: *const CudaRadixCiphertextFFI,
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn extend_radix_with_trivial_zero_blocks_msb_64(
|
||||
output: *mut CudaRadixCiphertextFFI,
|
||||
input: *const CudaRadixCiphertextFFI,
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
);
|
||||
}
|
||||
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
|
||||
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
|
||||
pub type KS_TYPE = ffi::c_uint;
|
||||
|
||||
3
backends/tfhe-hpu-backend/.gitattributes
vendored
Normal file
3
backends/tfhe-hpu-backend/.gitattributes
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
*.xclbin filter=lfs diff=lfs merge=lfs -text
|
||||
*.pdi filter=lfs diff=lfs merge=lfs -text
|
||||
python/lib/example.json filter=lfs diff=lfs merge=lfs -text
|
||||
3
backends/tfhe-hpu-backend/.gitignore
vendored
Normal file
3
backends/tfhe-hpu-backend/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
ngt_*
|
||||
config
|
||||
kogge_cfg.toml
|
||||
88
backends/tfhe-hpu-backend/Cargo.toml
Normal file
88
backends/tfhe-hpu-backend/Cargo.toml
Normal file
@@ -0,0 +1,88 @@
|
||||
[package]
|
||||
name = "tfhe-hpu-backend"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "BSD-3-Clause-Clear"
|
||||
description = "HPU implementation on FPGA of TFHE-rs primitives."
|
||||
homepage = "https://www.zama.ai/"
|
||||
documentation = "https://docs.zama.ai/tfhe-rs"
|
||||
repository = "https://github.com/zama-ai/tfhe-rs"
|
||||
readme = "README.md"
|
||||
keywords = ["encryption", "fhe", "cryptography", "hardware", "fpga"]
|
||||
|
||||
[features]
|
||||
hw-xrt = []
|
||||
hw-v80 = []
|
||||
io-dump = ["num-traits"]
|
||||
rtl_graph = ["dot2"]
|
||||
utils = ["clap", "clap-num", "bitvec", "serde_json"]
|
||||
|
||||
[build-dependencies]
|
||||
cxx-build = "1.0"
|
||||
|
||||
[dependencies]
|
||||
cxx = "1.0"
|
||||
hw_regmap = "0.1.0"
|
||||
|
||||
strum = { version = "0.26.2", features = ["derive"] }
|
||||
strum_macros = "0.26.2"
|
||||
enum_dispatch = "0.3.13"
|
||||
tracing = "0.1.40"
|
||||
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
toml = { version = "0.8", features = [] }
|
||||
paste = "1.0.15"
|
||||
thiserror = "1.0.61"
|
||||
bytemuck = "1.16.0"
|
||||
anyhow = "1.0.82"
|
||||
lazy_static = "1.4.0"
|
||||
rand = "0.8.5"
|
||||
regex = "1.10.4"
|
||||
bitflags = { version = "2.5.0", features = ["serde"] }
|
||||
itertools = "0.11.0"
|
||||
lru = "0.12.3"
|
||||
bitfield-struct = "0.10.0"
|
||||
crossbeam = { version = "0.8.4", features = ["crossbeam-queue"] }
|
||||
rayon = { workspace = true }
|
||||
|
||||
# Dependencies used for Sim feature
|
||||
ipc-channel = "0.18.3"
|
||||
|
||||
# Dependencies used for debug feature
|
||||
num-traits = { version = "0.2", optional = true }
|
||||
clap = { version = "4.4.4", features = ["derive"], optional = true }
|
||||
clap-num = { version = "1.1.1", optional = true }
|
||||
nix = { version = "0.29.0", features = ["ioctl", "uio"] }
|
||||
|
||||
# Dependencies used for rtl_graph features
|
||||
dot2 = { version = "1.0", optional = true }
|
||||
|
||||
bitvec = { version = "1.0", optional = true }
|
||||
serde_json = { version = "1.0", optional = true }
|
||||
|
||||
# Binary for manual debugging
|
||||
# Enable to access Hpu register and drive some custom sequence by hand
|
||||
[[bin]]
|
||||
name = "hputil"
|
||||
path = "src/utils/hputil.rs"
|
||||
required-features = ["utils"]
|
||||
|
||||
# Binary for asm manipulation
|
||||
# Enable to convert back and forth between asm/hex format
|
||||
[[bin]]
|
||||
name = "dop_fmt"
|
||||
path = "src/utils/dop_fmt.rs"
|
||||
required-features = ["utils"]
|
||||
|
||||
# Enable to convert back and forth between asm/hex format
|
||||
[[bin]]
|
||||
name = "iop_fmt"
|
||||
path = "src/utils/iop_fmt.rs"
|
||||
required-features = ["utils"]
|
||||
|
||||
# Firmware generation
|
||||
# Enable to expand IOp in list of Dop for inspection
|
||||
[[bin]]
|
||||
name = "fw"
|
||||
path = "src/utils/fw.rs"
|
||||
required-features = ["utils"]
|
||||
28
backends/tfhe-hpu-backend/LICENSE
Normal file
28
backends/tfhe-hpu-backend/LICENSE
Normal file
@@ -0,0 +1,28 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2025 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
3. Neither the name of ZAMA nor the names of its contributors may be used to endorse
|
||||
or promote products derived from this software without specific prior written permission.
|
||||
|
||||
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
|
||||
THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
|
||||
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
261
backends/tfhe-hpu-backend/README.md
Normal file
261
backends/tfhe-hpu-backend/README.md
Normal file
@@ -0,0 +1,261 @@
|
||||
# TFHE-hpu-backend
|
||||
|
||||
## Brief
|
||||
The `tfhe-hpu-backend` holds the code to interface with the HPU accelerator of TFHE.
|
||||
It contains a `HpuDevice` abstraction that enables easy configuration and dispatching of TFHE operations on the HPU accelerator.
|
||||
|
||||
The user API exposes the following functions for hardware setup:
|
||||
- `HpuDevice::new`, `HpuDevice::from_config`: Instantiates abstraction device from configuration file.
|
||||
- `HpuDevice::init`: Configures and uploads the required public material.
|
||||
- `new_var_from`: Creates a HPU ciphertext from `tfhe-rs` ciphertext.
|
||||
|
||||
HPU device could also be used from `integer` with the help of the following function:
|
||||
- `tfhe::integer::hpu::init_device`: Init given HPU device with server key.
|
||||
- `tfhe::integer::hpu::ciphertext::HpuRadixCiphertext::from_radix_ciphertext`: Convert a CpuRadixCiphertext in it's HPU counterpart.
|
||||
|
||||
HPU device could also be used seamlessly from `hl-api` by setting up a thread-local HPU server key:
|
||||
- `tfhe::Config::from_hpu_device`: Extract hl-api configuration from HpuDevice.
|
||||
- `tfhe::set_server_key`: Register the Hpu server key in the current thread.
|
||||
|
||||
HPU variables could also be created from a `high-level-api` object, with the help of the `hw-xfer` feature.
|
||||
This implements a trait that enables `clone_on`, `mv_on` `FheUint` object on the HPU accelerator, and cast back `from` them.
|
||||
|
||||
These objects implement the `std::ops` trait and could be used to dispatch operations on HPU hardware.
|
||||
|
||||
### Backend structure
|
||||
`tfhe-hpu-backend` is split in various modules:
|
||||
- `entities`: Defines structure handled by HPU accelerator. Conversion traits from/into those objects are implemented in `tfhe-rs`.
|
||||
- `asm`: Describes assembly-like language for the HPU. It enables abstract HPU behavior and easily updates it through micro-code.
|
||||
- `fw`: Abstraction to help the micro-code designer. Uses a simple rust program for describing new HPU operations. Helps with register/heap management.
|
||||
- `interface`:
|
||||
+ `device`: High-level structure that exposes the User API.
|
||||
+ `backend`: Inner private structure that contains HPU modules
|
||||
+ `variable`: Wraps HPU ciphertexts. It enables to hook an hardware object lifetime within the `rust` borrow-checker.
|
||||
+ `memory`: Handles on-board memory allocation and synchronization
|
||||
+ `config`: Helps to configure HPU accelerator through a TOML configuration file
|
||||
+ `cmd`: Translates operation over `variable` in concrete HPU commands
|
||||
+ `regmap`: Communicates with the HPU internal register with ease.
|
||||
+ `rtl`: Defines concrete `rust` structure populated from HPU's status/configuration registers
|
||||
|
||||
|
||||
Below is an overview of the internal structure of the Backend.
|
||||

|
||||
|
||||
This picture depicts the internal modules of `tfhe-hpu-backend`, Device is the main entry point for the user. Its lifecycle is as follows:
|
||||
|
||||
1. Create HpuDevice, open link with the associated FPGA. Configure associated drivers and upload the bitstream. Read FPGA registers to extract supported configuration and features. Build Firmware conversion table (IOp -> DOps stream).
|
||||
|
||||
2. Allocate required memory chunks in the on-board memory. Upload public material required by TFHE computation.
|
||||
|
||||
3. Create HPU variables that handle TFHE Ciphertexts. It wraps TFHE Ciphertext with required internal resources and enforces the correct lifetime management. This abstraction enforces that during the variable lifecycle all required resources are valid.
|
||||
|
||||
4. Users could trigger HPU operation from the HPU variable.
|
||||
Variable abstraction enforces that required objects are correctly synced on the hardware and converts each operation in a concrete HPU command.
|
||||
When HPU operation is acknowledged by the hardware, the internal state of the associated variable is updated.
|
||||
This mechanism enables asynchronous operation and minimal amount of Host to/from HW memory transfer.
|
||||
This mechanism also enables offloading a computation graph to the HPU and requires a synchronization only on the final results.
|
||||
|
||||
## Example
|
||||
### Configuration file
|
||||
HPU configuration knobs are gathered in a TOML configuration file. This file describes the targeted FPGA with its associated configuration:
|
||||
```toml
|
||||
[fpga] # FPGA target
|
||||
# Register layout in the FPGA
|
||||
regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_1in3.toml",
|
||||
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_3in3.toml",
|
||||
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_1in3.toml",
|
||||
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml"]
|
||||
polling_us=10
|
||||
[fpga.ffi.V80] # Hardware properties
|
||||
ami_dev="/dev/ami1" # Name of ami device
|
||||
qdma_h2c="/dev/qdma${V80_PCIE_DEV}001-MM-0" # QDma host to card device
|
||||
qdma_c2h="/dev/qdma${V80_PCIE_DEV}001-MM-1" # QDma card to host device
|
||||
|
||||
[rtl] # RTL option
|
||||
bpip_used = true # BPIP/IPIP mode
|
||||
bpip_use_opportunism = false # Use strict flush paradigm
|
||||
bpip_timeout = 100_000 # BPIP timeout in clock `cycles`
|
||||
|
||||
[board] # Board configuration
|
||||
ct_mem = 32768 # Number of allocated ciphertext
|
||||
ct_pc = [ # Memory used for ciphertext
|
||||
{Hbm= {pc=32}},
|
||||
{Hbm= {pc=33}},
|
||||
]
|
||||
heap_size = 16384 # Number of slots reserved for heap
|
||||
|
||||
lut_mem = 256 # Number of allocated LUT table
|
||||
lut_pc = {Hbm={pc=34}} # Memory used for LUT
|
||||
|
||||
fw_size= 16777216 # Size in byte of the Firmware translation table
|
||||
fw_pc = {Ddr= {offset= 0x3900_0000}} # Memory used for firmware translation table
|
||||
|
||||
bsk_pc = [ # Memory used for Bootstrapping key
|
||||
{Hbm={pc=8}},
|
||||
{Hbm={pc=12}},
|
||||
{Hbm={pc=24}},
|
||||
{Hbm={pc=28}},
|
||||
{Hbm={pc=40}},
|
||||
{Hbm={pc=44}},
|
||||
{Hbm={pc=56}},
|
||||
{Hbm={pc=60}}
|
||||
]
|
||||
|
||||
ksk_pc = [ # Memory used for Keyswitching key
|
||||
{Hbm={pc=0}},
|
||||
{Hbm={pc=1}},
|
||||
{Hbm={pc=2}},
|
||||
{Hbm={pc=3}},
|
||||
{Hbm={pc=4}},
|
||||
{Hbm={pc=5}},
|
||||
{Hbm={pc=6}},
|
||||
{Hbm={pc=7}},
|
||||
{Hbm={pc=16}},
|
||||
{Hbm={pc=17}},
|
||||
{Hbm={pc=18}},
|
||||
{Hbm={pc=19}},
|
||||
{Hbm={pc=20}},
|
||||
{Hbm={pc=21}},
|
||||
{Hbm={pc=22}},
|
||||
{Hbm={pc=23}}
|
||||
]
|
||||
|
||||
trace_pc = {Hbm={pc=35}} # Memory used for trace log
|
||||
trace_depth = 32 # Size of Memory in MiB allocated for trace log
|
||||
|
||||
[firmware] # Firmware properties
|
||||
implementation = "Llt" # Firmware flavor to use
|
||||
integer_w=[4,6,8,10,12,14,16,32,64,128] # List of supported IOp width
|
||||
min_batch_size = 11 # Minimum batch size for maximum throughput
|
||||
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
|
||||
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
|
||||
|
||||
# Default firmware configuration. Could be edited on per-IOp basis
|
||||
[firmware.op_cfg.default]
|
||||
fill_batch_fifo = true
|
||||
min_batch_size = false
|
||||
use_tiers = false
|
||||
flush_behaviour = "Patient"
|
||||
flush = true
|
||||
```
|
||||
|
||||
### Device setup
|
||||
Following code snippet shows how to instantiate and configure a `HpuDevice`:
|
||||
```rust
|
||||
// Following code snippets used the HighLevelApi abstraction
|
||||
// Instantiate HpuDevice --------------------------------------------------
|
||||
let hpu_device = HpuDevice::from_config(&args.config.expand());
|
||||
|
||||
// Generate keys ----------------------------------------------------------
|
||||
let config = Config::from_hpu_device(&hpu_device);
|
||||
|
||||
let cks = ClientKey::generate(config);
|
||||
let csks = CompressedServerKey::new(&cks);
|
||||
|
||||
// Register HpuDevice and key as thread-local engine
|
||||
set_server_key((hpu_device, csks));
|
||||
```
|
||||
|
||||
### Clone CPU ciphertext on HPU
|
||||
Following code snippet shows how to convert CPU ciphertext in HPU one:
|
||||
``` rust
|
||||
// Draw random value as input
|
||||
let a = rand::thread_rng().gen_range(0..u8::MAX);
|
||||
|
||||
// Encrypt them on Cpu side
|
||||
let a_fhe = FheUint8::encrypt(a, &cks);
|
||||
|
||||
// Clone a ciphertext and move them in HpuWorld
|
||||
// NB: Data doesn't move over Pcie at this stage
|
||||
// Data are only arranged in Hpu ordered an copy in the host internal buffer
|
||||
let a_hpu = a_fhe.clone_on(&hpu_device);
|
||||
```
|
||||
|
||||
### Dispatch operation on HPU
|
||||
Once registered as thread-local engine, HighLevel FheUint are converted in Hpu format.
|
||||
Following code snippets show how to start operation on HPU:
|
||||
|
||||
``` rust
|
||||
// Sum -------------------------------------------------------------
|
||||
// Generate random inputs value and compute expected result
|
||||
let in_a = rng.gen_range(0..u64::max_value());
|
||||
let in_b = rng.gen_range(0..u64::max_value());
|
||||
let clear_sum_ab = in_a.wrapping_add(in_b);
|
||||
|
||||
// Encrypt input value
|
||||
let fhe_a = FheUint64::encrypt(in_a, cks);
|
||||
let fhe_b = FheUint64::encrypt(in_b, cks);
|
||||
|
||||
// Triggered operation on HPU through hl_api
|
||||
let fhe_sum_ab = fhe_a+fhe_b;
|
||||
|
||||
// Decrypt values
|
||||
let dec_sum_ab: u64 = fhe_sum_ab.decrypt(cks);
|
||||
```
|
||||
|
||||
## Pre-made Examples
|
||||
There are some example applications already available in `tfhe/examples/hpu`:
|
||||
* hpu_hlapi: Depict the used of HPU device through HighLevelApi.
|
||||
* hpu_bench: Depict the used of HPU device through Integer abstraction level.
|
||||
|
||||
In order to run those applications on hardware, user must build from the project root (i.e `tfhe-rs-internal`) with `hpu-v80` features:
|
||||
|
||||
> NB: Running examples required to have correctly pulled the `.pdi` files. Those files, due to their size, are backed by git-lfs and disabled by default.
|
||||
> In order to retrieve them, use the following command:
|
||||
> ```bash
|
||||
> git lfs pull --include="*" --exclude=""
|
||||
> ```
|
||||
|
||||
``` bash
|
||||
cargo build --release --features="hpu-v80" --example hpu_hlapi --example hpu_bench
|
||||
# Correctly setup environment with setup_hpu.sh script
|
||||
source setup_hpu.sh --config v80 --init-qdma
|
||||
./target/release/examples/hpu_bench --integer-w 64 --integer-w 32 --iop MUL --iter 10
|
||||
./target/release/examples/hpu_hlapi
|
||||
```
|
||||
|
||||
## Test framework
|
||||
There is also a set of tests backed in tfhe-rs. Tests are gather in testbundle over various integer width.
|
||||
Those tests have 5 sub-kind:
|
||||
* `alu`: Run and check all ct x ct IOp
|
||||
* `alus`: Run and check all ct x scalar IOp
|
||||
* `bitwise`: Run and check all bitwise IOp
|
||||
* `cmp`: Run and check all comparison IOp
|
||||
* `ternary`: Run and check ternary operation
|
||||
* `algo`: Run and check IOp dedicated to offload small algorithms
|
||||
|
||||
|
||||
Snippets below give some example of command that could be used for testing:
|
||||
``` bash
|
||||
# Correctly setup environment with setup_hpu.sh script
|
||||
source setup_hpu.sh --config v80 --init-qdma
|
||||
|
||||
# Run all sub-kind for 64b integer width
|
||||
cargo test --release --features="hpu-v80" --test hpu -- u64
|
||||
|
||||
# Run only `bitwise` sub-kind for all integer width IOp
|
||||
cargo test --release --features="hpu-v80" --test hpu -- bitwise
|
||||
```
|
||||
|
||||
## Benches framework
|
||||
HPU is completely integrated in tfhe benchmark system. Performances results could be extracted from HighLevelApi or Integer Api.
|
||||
Three benchmarks could be started, through the following Makefile target for simplicity:
|
||||
``` bash
|
||||
# Do not forget to correctly set environment before hand
|
||||
source setup_hpu.sh --config v80 --init-qdma
|
||||
|
||||
# Run hlapi benches
|
||||
make test_high_level_api_hpu
|
||||
|
||||
# Run hlapi erc20 benches
|
||||
make bench_hlapi_erc20_hpu
|
||||
|
||||
# Run integer level benches
|
||||
make bench_integer_hpu
|
||||
```
|
||||
|
||||
## Eager to start without real Hardware ?
|
||||
You are still waiting your FPGA board and are frustrated by lead time ?
|
||||
Don't worry, you have backed-up. A dedicated simulation infrastructure with accurate performance estimation is available in tfhe-rs.
|
||||
You can use it on any linux/MacOs to test HPU integration within tfhe-rs and optimized your application for HPU target.
|
||||
Simply through an eye to [Hpu mockup](../../mockups/tfhe-hpu-mockup/README.md), and follow the instruction.
|
||||
26
backends/tfhe-hpu-backend/build.rs
Normal file
26
backends/tfhe-hpu-backend/build.rs
Normal file
@@ -0,0 +1,26 @@
|
||||
fn main() {
|
||||
if cfg!(feature = "hw-xrt") {
|
||||
println!("cargo:rustc-link-search=/opt/xilinx/xrt/lib");
|
||||
println!("cargo:rustc-link-lib=dylib=stdc++");
|
||||
println!("cargo:rustc-link-lib=dl");
|
||||
println!("cargo:rustc-link-lib=rt");
|
||||
println!("cargo:rustc-link-lib=uuid");
|
||||
println!("cargo:rustc-link-lib=dylib=xrt_coreutil");
|
||||
|
||||
cxx_build::bridge("src/ffi/xrt/mod.rs")
|
||||
.file("src/ffi/xrt/cxx/hpu_hw.cc")
|
||||
.file("src/ffi/xrt/cxx/mem_zone.cc")
|
||||
.flag_if_supported("-std=c++23")
|
||||
.include("/opt/xilinx/xrt/include") // Enhance: support parsing bash env instead of hard path
|
||||
.flag("-fmessage-length=0")
|
||||
.compile("hpu-hw-ffi");
|
||||
|
||||
println!("cargo:rerun-if-changed=src/ffi/xrt/mod.rs");
|
||||
println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/hpu_hw.cc");
|
||||
println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/hpu_hw.h");
|
||||
println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/mem_zone.cc");
|
||||
println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/mem_zone.h");
|
||||
} else {
|
||||
// Simulation ffi -> nothing to do
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
# CUST_0
|
||||
# Simple IOp to check the xfer between Hpu/Cpu
|
||||
# Construct constant in dest slot -> 249 (0xf9)
|
||||
SUB R0 R0 R0
|
||||
ADDS R0 R0 1
|
||||
ST TD[0].0 R0
|
||||
SUB R1 R1 R1
|
||||
ADDS R1 R1 2
|
||||
ST TD[0].1 R1
|
||||
SUB R2 R2 R2
|
||||
ADDS R2 R2 3
|
||||
ST TD[0].2 R2
|
||||
SUB R3 R3 R3
|
||||
ADDS R3 R3 3
|
||||
ST TD[0].3 R3
|
||||
@@ -0,0 +1,11 @@
|
||||
# CUST_1
|
||||
# Simple IOp to check the xfer between Hpu/Cpu
|
||||
# Dest <- Src_a
|
||||
LD R0 TS[0].0
|
||||
LD R1 TS[0].1
|
||||
LD R2 TS[0].2
|
||||
LD R3 TS[0].3
|
||||
ST TD[0].0 R0
|
||||
ST TD[0].1 R1
|
||||
ST TD[0].2 R2
|
||||
ST TD[0].3 R3
|
||||
@@ -0,0 +1,25 @@
|
||||
; CUST_8
|
||||
; Simple IOp to check the ALU operation
|
||||
; Dst[0].0 <- Src[0].0 + Src[1].0
|
||||
LD R1 TS[0].0
|
||||
LD R2 TS[1].0
|
||||
ADD R0 R1 R2
|
||||
ST TD[0].0 R0
|
||||
|
||||
; Dst[0].1 <- Src[0].1 + Src[1].1
|
||||
LD R5 TS[0].1
|
||||
LD R6 TS[1].1
|
||||
ADD R4 R5 R6
|
||||
ST TD[0].2 R4
|
||||
|
||||
; Dst[0].2 <- Src[0].2 + Src[1].2
|
||||
LD R9 TS[0].2
|
||||
LD R10 TS[1].2
|
||||
ADD R8 R9 R10
|
||||
ST TD[0].2 R8
|
||||
|
||||
; Dst[0].3 <- Src[0].3 + Src[1].3
|
||||
LD R13 TS[0].3
|
||||
LD R14 TS[1].3
|
||||
ADD R12 R13 R14
|
||||
ST TD[0].3 R0
|
||||
@@ -0,0 +1,6 @@
|
||||
# CUST_16
|
||||
# Simple IOp to check PBS behavior
|
||||
# Dest <- PBSNone(Src_a.0)
|
||||
LD R0 TS[0].0
|
||||
PBS_F R0 R0 PbsNone
|
||||
ST TD[0].0 R0
|
||||
@@ -0,0 +1,15 @@
|
||||
# CUST_17
|
||||
# Simple IOp to check PBS behavior
|
||||
# Dest <- PBSNone(Src_a)
|
||||
LD R0 TS[0].0
|
||||
PBS R0 R0 PbsNone
|
||||
ST TD[0].0 R0
|
||||
LD R1 TS[0].1
|
||||
PBS R1 R1 PbsNone
|
||||
ST TD[0].1 R1
|
||||
LD R2 TS[0].2
|
||||
PBS R2 R2 PbsNone
|
||||
ST TD[0].2 R2
|
||||
LD R3 TS[0].3
|
||||
PBS_F R3 R3 PbsNone
|
||||
ST TD[0].3 R3
|
||||
@@ -0,0 +1,23 @@
|
||||
; CUST_18
|
||||
; Simple IOp to check extraction pattern
|
||||
; Correct result:
|
||||
; * Dst[0,1] <- Src[0][0,1]
|
||||
; * Dst[2,3] <- Src[1][0,1]
|
||||
|
||||
; Pack Src[0][0,1] with a Mac and extract Carry/Msg in Dst[0][0,1]
|
||||
LD R0 TS[0].0
|
||||
LD R1 TS[0].1
|
||||
MAC R3 R1 R0 4
|
||||
PBS R4 R3 PbsMsgOnly
|
||||
PBS R5 R3 PbsCarryInMsg
|
||||
ST TD[0].0 R4
|
||||
ST TD[0].1 R5
|
||||
|
||||
; Pack Src[1][0,1] with a Mac and extract Carry/Msg in Dst[0][2,3]
|
||||
LD R10 TS[1].0
|
||||
LD R11 TS[1].1
|
||||
MAC R13 R11 R10 4
|
||||
PBS R14 R13 PbsMsgOnly
|
||||
PBS R15 R13 PbsCarryInMsg
|
||||
ST TD[0].2 R14
|
||||
ST TD[0].3 R15
|
||||
@@ -0,0 +1,19 @@
|
||||
; CUST_19
|
||||
; Simple IOp to check PbsMl2
|
||||
; Correct result:
|
||||
; * Dst[0][0] <- Src[0][0]
|
||||
; * Dst[0][1] <- 0
|
||||
; * Dst[0][2] <- Src[0][0] +1
|
||||
; * Dst[0][3] <- 0
|
||||
; i.e Cust_19(0x2) => 0x32
|
||||
|
||||
; Construct a 0 for destination padding
|
||||
SUB R16 R16 R16
|
||||
|
||||
; Apply PbsMl2 on Src[0] result goes in dest[0][0-3] (0-padded)
|
||||
LD R0 TS[0].0
|
||||
PBS_ML2_F R0 R0 PbsTestMany2
|
||||
ST TD[0].0 R0
|
||||
ST TD[0].1 R16
|
||||
ST TD[0].2 R1
|
||||
ST TD[0].3 R16
|
||||
@@ -0,0 +1,11 @@
|
||||
# CUST_2
|
||||
# Simple IOp to check the xfer between Hpu/Cpu
|
||||
# Dest <- Src_b
|
||||
LD R0 TS[1].0
|
||||
LD R1 TS[1].1
|
||||
LD R2 TS[1].2
|
||||
LD R3 TS[1].3
|
||||
ST TD[0].0 R0
|
||||
ST TD[0].1 R1
|
||||
ST TD[0].2 R2
|
||||
ST TD[0].3 R3
|
||||
@@ -0,0 +1,22 @@
|
||||
; CUST_20
|
||||
; Simple IOp to check PbsMl4
|
||||
; Correct result:
|
||||
; * Dst[0][0] <- Src[0][0]
|
||||
; * Dst[0][1] <- Src[0][0] +1
|
||||
; * Dst[0][2] <- Src[0][0] +2
|
||||
; * Dst[0][3] <- Src[0][0] +3
|
||||
; i.e Cust_20(0x0) => 0xe4
|
||||
|
||||
SUB R16 R16 R16
|
||||
ST TD[0].0 R0
|
||||
ST TD[0].1 R0
|
||||
ST TD[0].2 R0
|
||||
ST TD[0].3 R0
|
||||
|
||||
; Apply PbsMl4 on Src[0] result goes in dest[0][0-3]
|
||||
LD R0 TS[0].0
|
||||
PBS_ML4_F R0 R0 PbsTestMany4
|
||||
ST TD[0].0 R0
|
||||
ST TD[0].1 R1
|
||||
ST TD[0].2 R2
|
||||
ST TD[0].3 R3
|
||||
@@ -0,0 +1,24 @@
|
||||
; CUST_21
|
||||
; Simple IOp to check PbsMl8
|
||||
; WARN: This operation required 16b ct width
|
||||
; Correct result:
|
||||
; * Dst[0][0] <- Src[0][0]
|
||||
; * Dst[0][1] <- Src[0][0] +1
|
||||
; * Dst[0][2] <- Src[0][0] +2
|
||||
; * Dst[0][3] <- Src[0][0] +3
|
||||
; * Dst[0][4] <- Src[0][0] +4
|
||||
; * Dst[0][5] <- Src[0][0] +5
|
||||
; * Dst[0][6] <- Src[0][0] +6
|
||||
; * Dst[0][7] <- Src[0][0] +7
|
||||
|
||||
; Apply PbsMl8 on Src[0] result goes in dest[0][0-7]
|
||||
LD R0 TS[0].0
|
||||
PBS_ML8_F R0 R0 PbsTestMany8
|
||||
ST TD[0].0 R0
|
||||
ST TD[0].1 R1
|
||||
ST TD[0].2 R2
|
||||
ST TD[0].3 R3
|
||||
ST TD[0].4 R4
|
||||
ST TD[0].5 R5
|
||||
ST TD[0].6 R6
|
||||
ST TD[0].7 R7
|
||||
@@ -0,0 +1,16 @@
|
||||
# CUST_3
|
||||
# Simple IOp to check isc behavior
|
||||
# Generate obvious deps and check that isc correctly issued the dop
|
||||
# Correct result must bu Dest <- Src[0]
|
||||
LD R0 TS[0].0
|
||||
LD R1 TS[0].1
|
||||
LD R2 TS[0].2
|
||||
LD R3 TS[0].3
|
||||
PBS R4 R0 PbsNone
|
||||
ST TD[0].0 R4
|
||||
PBS R4 R1 PbsNone
|
||||
ST TD[0].1 R4
|
||||
PBS R4 R2 PbsNone
|
||||
ST TD[0].2 R4
|
||||
PBS_F R4 R3 PbsNone
|
||||
ST TD[0].3 R4
|
||||
@@ -0,0 +1,19 @@
|
||||
; CUST_8
|
||||
; Simple IOp to check the ALU operation
|
||||
; Dst[0].0 <- Src[0].0 + Src[1].0
|
||||
LD R1 TS[0].0
|
||||
LD R2 TS[1].0
|
||||
ADD R0 R1 R2
|
||||
ST TD[0].0 R0
|
||||
|
||||
; Dst[0].1 <- Src[0].1 - Src[1].1
|
||||
LD R5 TS[0].1
|
||||
LD R6 TS[1].1
|
||||
SUB R4 R5 R6
|
||||
ST TD[0].1 R4
|
||||
|
||||
; Dst[0].2 <- Src[0].2 + (Src[1].2 *4)
|
||||
LD R9 TS[0].2
|
||||
LD R10 TS[1].2
|
||||
MAC R8 R9 R10 4
|
||||
ST TD[0].2 R8
|
||||
@@ -0,0 +1,21 @@
|
||||
; CUST_9
|
||||
; Simple IOp to check the ALU Scalar operation
|
||||
; Dst[0].0 <- Src[0].0 + Imm[0].0
|
||||
LD R1 TS[0].0
|
||||
ADDS R0 R1 TI[0].0
|
||||
ST TD[0].0 R0
|
||||
|
||||
; Dst[0].1 <- Src[0].1 - Imm[0].1
|
||||
LD R5 TS[0].1
|
||||
SUBS R4 R5 TI[0].1
|
||||
ST TD[0].1 R4
|
||||
|
||||
; Dst[0].2 <- Imm[0].2 - Src[0].2
|
||||
LD R9 TS[0].2
|
||||
SSUB R8 R9 TI[0].2
|
||||
ST TD[0].2 R8
|
||||
|
||||
; Dst[0].3 <- Src[0].3 * Imm[0].3
|
||||
LD R13 TS[0].3
|
||||
MULS R12 R13 TI[0].3
|
||||
ST TD[0].3 R12
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user