Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-11 15:48:20 -05:00)
Compare commits
2 commits: bench/gpu/...gpu/hypers
| Author | SHA1 | Date |
|---|---|---|
| | c721b0fdaf | |
| | 470667507d | |
.github/workflows/aws_tfhe_fast_tests.yml (vendored, 24 lines changed)
@@ -18,8 +18,8 @@ on:
   pull_request:
 
 jobs:
-  setup-instance:
-    name: Setup instance (fast-tests)
+  setup-ec2:
+    name: Setup EC2 instance (fast-tests)
     runs-on: ubuntu-latest
     outputs:
       runner-name: ${{ steps.start-instance.outputs.label }}
@@ -37,21 +37,21 @@ jobs:
 
   fast-tests:
     name: Fast CPU tests
-    needs: setup-instance
+    needs: setup-ec2
     concurrency:
       group: ${{ github.workflow }}_${{ github.ref }}
       cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
       - name: Set up home
         run: |
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: stable
 
@@ -115,10 +115,10 @@ jobs:
           SLACK_COLOR: ${{ job.status }}
           SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
 
-  teardown-instance:
-    name: Teardown instance (fast-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, fast-tests ]
+  teardown-ec2:
+    name: Teardown EC2 instance (fast-tests)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, fast-tests ]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
@@ -129,7 +129,7 @@ jobs:
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -137,4 +137,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -29,10 +29,10 @@ jobs:
 
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
       - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: stable
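All of the test workflows in this diff share one lifecycle: a setup job starts a Slab-provisioned runner and publishes its label, the test job runs on that label, and a teardown job always stops the runner. A condensed sketch of the pattern, assembled from the hunks above; the runner start/stop action is shown as a placeholder since its `uses:` line is not visible in this diff:

```yaml
jobs:
  setup-ec2:
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
        uses: <runner-start-action>   # placeholder; the real workflows pin an action SHA
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}

  fast-tests:
    needs: setup-ec2
    # Run directly on the instance started above.
    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
    steps:
      - run: make test   # stand-in for the real test commands

  teardown-ec2:
    # Always runs (even on failure) so the instance is never leaked,
    # but is skipped when setup never ran.
    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
    needs: [ setup-ec2, fast-tests ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        uses: <runner-start-action>   # placeholder; same action, mode: stop
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          label: ${{ needs.setup-ec2.outputs.runner-name }}
```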
.github/workflows/aws_tfhe_gpu_tests.yml (vendored, 90 lines changed)
@@ -18,8 +18,8 @@ on:
   pull_request:
 
 jobs:
-  setup-instance:
-    name: Setup instance (cuda-tests)
+  setup-ec2:
+    name: Setup EC2 instance (cuda-tests)
     runs-on: ubuntu-latest
     outputs:
       runner-name: ${{ steps.start-instance.outputs.label }}
@@ -30,18 +30,18 @@ jobs:
         with:
           mode: start
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
           job-secret: ${{ secrets.JOB_SECRET }}
           backend: aws
           profile: gpu-test
 
-  cuda-pcc:
-    name: CUDA post-commit checks
-    needs: setup-instance
+  cuda-tests-linux:
+    name: CUDA tests
+    needs: setup-ec2
     concurrency:
       group: ${{ github.workflow }}_${{ github.ref }}
       cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     strategy:
       fail-fast: false
       # explicit include-based build matrix, of known valid options
@@ -55,14 +55,14 @@ jobs:
 
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
       - name: Set up home
         run: |
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
         with:
           toolchain: stable
 
@@ -93,64 +93,6 @@ jobs:
         run: |
           make pcc_gpu
 
-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  cuda-tests-linux:
-    name: CUDA tests
-    needs: [ setup-instance, cuda-pcc ]
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 9
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
-
-      - name: Set up home
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
-        with:
-          toolchain: stable
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
       - name: Run core crypto, integer and internal CUDA backend tests
         run: |
           make test_gpu
@@ -175,10 +117,10 @@ jobs:
           SLACK_COLOR: ${{ job.status }}
           SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
 
-  teardown-instance:
-    name: Teardown instance (cuda-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-pcc, cuda-tests-linux ]
+  teardown-ec2:
+    name: Teardown EC2 instance (cuda-tests)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, cuda-tests-linux ]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
@@ -187,9 +129,9 @@ jobs:
         with:
           mode: stop
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -197,4 +139,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
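The CUDA environment-export steps kept by the surviving `cuda-tests-linux` job work by appending to the `GITHUB_ENV` and `GITHUB_PATH` files, which only takes effect in steps that run afterwards. The step bodies below are taken verbatim from the hunk above, with explanatory comments added:

```yaml
- name: Export CUDA variables
  if: ${{ !cancelled() }}
  run: |
    # Each line appended to $GITHUB_ENV becomes an env var in later steps.
    echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
    # Appending a directory to $GITHUB_PATH prepends it to PATH for later steps.
    echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
    echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
    echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

# Specify the correct host compilers for nvcc; the braces group all echoes
# into a single redirection to $GITHUB_ENV.
- name: Export gcc and g++ variables
  if: ${{ !cancelled() }}
  run: |
    {
      echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
      echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
      echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
      echo "HOME=/home/ubuntu";
    } >> "${GITHUB_ENV}"
```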
.github/workflows/aws_tfhe_integer_tests.yml (vendored, 24 lines changed)
@@ -18,8 +18,8 @@ on:
     types: [ labeled ]
 
 jobs:
-  setup-instance:
-    name: Setup instance (unsigned-integer-tests)
+  setup-ec2:
+    name: Setup EC2 instance (unsigned-integer-tests)
     if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
     runs-on: ubuntu-latest
     outputs:
@@ -38,21 +38,21 @@ jobs:
 
   unsigned-integer-tests:
     name: Unsigned integer tests
-    needs: setup-instance
+    needs: setup-ec2
     concurrency:
       group: ${{ github.workflow }}_${{ github.ref }}
       cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
       - name: Set up home
         run: |
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: stable
 
@@ -80,10 +80,10 @@ jobs:
           SLACK_COLOR: ${{ job.status }}
           SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
 
-  teardown-instance:
-    name: Teardown instance (unsigned-integer-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, unsigned-integer-tests ]
+  teardown-ec2:
+    name: Teardown EC2 instance (unsigned-integer-tests)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, unsigned-integer-tests ]
     runs-on: ubuntu-latest
     steps:
      - name: Stop instance
@@ -94,7 +94,7 @@ jobs:
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -102,4 +102,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -18,8 +18,8 @@ on:
     types: [ labeled ]
 
 jobs:
-  setup-instance:
-    name: Setup instance (signed-integer-tests)
+  setup-ec2:
+    name: Setup EC2 instance (signed-integer-tests)
     if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
     runs-on: ubuntu-latest
     outputs:
@@ -38,21 +38,21 @@ jobs:
 
   signed-integer-tests:
     name: Signed integer tests
-    needs: setup-instance
+    needs: setup-ec2
     concurrency:
       group: ${{ github.workflow }}_${{ github.ref }}
       cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
       - name: Set up home
         run: |
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: stable
 
@@ -84,10 +84,10 @@ jobs:
           SLACK_COLOR: ${{ job.status }}
           SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
 
-  teardown-instance:
-    name: Teardown instance (signed-integer-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, signed-integer-tests ]
+  teardown-ec2:
+    name: Teardown EC2 instance (signed-integer-tests)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, signed-integer-tests ]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
@@ -98,7 +98,7 @@ jobs:
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -106,4 +106,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
.github/workflows/aws_tfhe_tests.yml (vendored, 24 lines changed)
@@ -18,8 +18,8 @@ on:
     types: [ labeled ]
 
 jobs:
-  setup-instance:
-    name: Setup instance (cpu-tests)
+  setup-ec2:
+    name: Setup EC2 instance (cpu-tests)
     if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
     runs-on: ubuntu-latest
     outputs:
@@ -38,21 +38,21 @@ jobs:
 
   cpu-tests:
     name: CPU tests
-    needs: setup-instance
+    needs: setup-ec2
     concurrency:
       group: ${{ github.workflow }}_${{ github.ref }}
       cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
       - name: Set up home
         run: |
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: stable
 
@@ -110,10 +110,10 @@ jobs:
           SLACK_COLOR: ${{ job.status }}
           SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
 
-  teardown-instance:
-    name: Teardown instance (cpu-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cpu-tests ]
+  teardown-ec2:
+    name: Teardown EC2 instance (cpu-tests)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, cpu-tests ]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
@@ -124,7 +124,7 @@ jobs:
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -132,4 +132,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
.github/workflows/aws_tfhe_wasm_tests.yml (vendored, 24 lines changed)
@@ -18,8 +18,8 @@ on:
     types: [ labeled ]
 
 jobs:
-  setup-instance:
-    name: Setup instance (wasm-tests)
+  setup-ec2:
+    name: Setup EC2 instance (wasm-tests)
     if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
     runs-on: ubuntu-latest
     outputs:
@@ -38,21 +38,21 @@ jobs:
 
   wasm-tests:
     name: WASM tests
-    needs: setup-instance
+    needs: setup-ec2
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
       - name: Set up home
         run: |
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: stable
 
@@ -80,10 +80,10 @@ jobs:
           SLACK_COLOR: ${{ job.status }}
           SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
 
-  teardown-instance:
-    name: Teardown instance (wasm-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, wasm-tests ]
+  teardown-ec2:
+    name: Teardown EC2 instance (wasm-tests)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, wasm-tests ]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
@@ -94,7 +94,7 @@ jobs:
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -102,4 +102,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
.github/workflows/boolean_benchmark.yml (vendored, 8 lines changed)
@@ -53,7 +53,7 @@ jobs:
           echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
 
       - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           fetch-depth: 0
 
@@ -63,7 +63,7 @@ jobs:
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: nightly
 
@@ -97,13 +97,13 @@ jobs:
           --append-results
 
       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_boolean
           path: ${{ env.RESULTS_FILENAME }}
 
       - name: Checkout Slab repo
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: zama-ai/slab
           path: slab
.github/workflows/cargo_build.yml (vendored, 2 lines changed)
@@ -23,7 +23,7 @@ jobs:
       fail-fast: false
 
     steps:
-    - uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+    - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
    - name: Install and run newline linter checks
      if: matrix.os == 'ubuntu-latest'
.github/workflows/ci_lint.yml (vendored, 2 lines changed)
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
       - name: Get actionlint
         run: |
.github/workflows/code_coverage.yml (vendored, 10 lines changed)
@@ -53,7 +53,7 @@ jobs:
           echo "Fork git sha: ${{ inputs.fork_git_sha }}"
 
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: ${{ inputs.fork_repo }}
           ref: ${{ inputs.fork_git_sha }}
@@ -63,13 +63,13 @@ jobs:
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: stable
 
       - name: Check for file changes
         id: changed-files
-        uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
+        uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
         with:
           files_yaml: |
             tfhe:
@@ -99,7 +99,7 @@ jobs:
           make test_shortint_cov
 
       - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@5ecb98a3c6b747ed38dc09f787459979aebb39be
+        uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
         if: steps.changed-files.outputs.tfhe_any_changed == 'true'
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
@@ -113,7 +113,7 @@ jobs:
           make test_integer_cov
 
       - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@5ecb98a3c6b747ed38dc09f787459979aebb39be
+        uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
         if: steps.changed-files.outputs.tfhe_any_changed == 'true'
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
.github/workflows/core_crypto_benchmark.yml (vendored, 8 lines changed)
@@ -53,7 +53,7 @@ jobs:
           echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
 
       - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           fetch-depth: 0
 
@@ -63,7 +63,7 @@ jobs:
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: nightly
 
@@ -88,13 +88,13 @@ jobs:
           --throughput
 
       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_core_crypto
           path: ${{ env.RESULTS_FILENAME }}
 
       - name: Checkout Slab repo
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: zama-ai/slab
           path: slab
.github/workflows/core_crypto_gpu_benchmark.yml (vendored, 100 lines changed)
@@ -1,24 +1,18 @@
-# Run core crypto benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+# Run core crypto benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
 name: Core crypto GPU benchmarks
 
 on:
   # Allows you to run this workflow manually from the Actions tab as an alternative.
   workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
 
 env:
   CARGO_TERM_COLOR: always
   RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
   ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
 
 jobs:
-  setup-instance:
-    name: Setup instance (cuda-core-crypto-benchmarks)
+  setup-ec2:
+    name: Setup EC2 instance (cuda-benchmarks)
     runs-on: ubuntu-latest
     outputs:
       runner-name: ${{ steps.start-instance.outputs.label }}
@@ -29,15 +23,15 @@ jobs:
         with:
           mode: start
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
+          profile: gpu-bench
 
-  cuda-core-crypto-benchmarks:
-    name: Execute GPU core crypto benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+  core-crypto-benchmarks:
+    name: CUDA core crypto benchmarks
+    needs: setup-ec2
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     strategy:
       fail-fast: false
       # explicit include-based build matrix, of known valid options
@@ -49,21 +43,12 @@ jobs:
     env:
       CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
       CMAKE_VERSION: 3.29.1
 
     steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
       - name: Install dependencies
         run: |
           sudo apt update
           sudo apt install ca-certificates curl
-          sudo install -m 0755 -d /etc/apt/keyrings
-          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-          sudo chmod a+r /etc/apt/keyrings/docker.asc
-          echo \
-            "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
-            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -71,26 +56,22 @@ jobs:
           make -j"$(nproc)"
           sudo make install
 
+      - name: Get benchmark date
+        run: |
+          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+
       - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           fetch-depth: 0
 
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
       - name: Set up home
         # "Install rust" step require root user to have a HOME directory which is not set.
         run: |
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: nightly
 
@@ -122,26 +103,28 @@ jobs:
 
       - name: Parse results
         run: |
+          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
+          COMMIT_HASH="$(git describe --tags --dirty)"
           python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
           --database tfhe_rs \
-          --hardware "n3-H100x1" \
+          --hardware ${{ inputs.instance_type }} \
           --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
+          --project-version "${COMMIT_HASH}" \
           --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
+          --commit-date "${COMMIT_DATE}" \
           --bench-date "${{ env.BENCH_DATE }}" \
           --name-suffix avx512 \
           --walk-subdirs \
           --throughput
 
       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_core_crypto
           path: ${{ env.RESULTS_FILENAME }}
 
       - name: Checkout Slab repo
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: zama-ai/slab
           path: slab
@@ -161,18 +144,23 @@ jobs:
           -d @${{ env.RESULTS_FILENAME }} \
           ${{ secrets.SLAB_URL }}
 
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+      # FIXME This action needs docker to be installed on the machine beforehand.
+      # - name: Slack Notification
+      #   if: ${{ failure() }}
+      #   continue-on-error: true
+      #   uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+      #   env:
+      #     SLACK_COLOR: ${{ job.status }}
+      #     SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      #     SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+      #     SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+      #     SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+      #     SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
 
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-core-crypto-benchmarks ]
+  teardown-ec2:
+    name: Teardown EC2 instance (cuda-benchmarks)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, core-crypto-benchmarks ]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
@@ -181,9 +169,9 @@ jobs:
         with:
           mode: stop
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -191,4 +179,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (cuda-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
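The "Parse results" change above swaps workflow-level expressions (`${{ env.COMMIT_HASH }}`, expanded by the Actions runner from values previously written to `$GITHUB_ENV`) for plain shell variables computed in the same `run:` block, so the parse step no longer depends on the earlier "Get benchmark details" step. A trimmed sketch, showing only a few of the real flags:

```yaml
- name: Parse results
  run: |
    # Shell variables local to this block; no $GITHUB_ENV round-trip needed.
    COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
    COMMIT_HASH="$(git describe --tags --dirty)"
    # RESULTS_FILENAME is a workflow-level env var, visible to the shell as-is.
    python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
      --project-version "${COMMIT_HASH}" \
      --commit-date "${COMMIT_DATE}"
```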
.github/workflows/csprng_randomness_tests.yml (vendored, 24 lines changed)
@@ -19,8 +19,8 @@ on:
 
 
 jobs:
-  setup-instance:
-    name: Setup instance (csprng-randomness-tests)
+  setup-ec2:
+    name: Setup EC2 instance (csprng-randomness-tests)
     if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
     runs-on: ubuntu-latest
     outputs:
@@ -39,21 +39,21 @@ jobs:
 
   csprng-randomness-tests:
     name: CSPRNG randomness tests
-    needs: setup-instance
+    needs: setup-ec2
     concurrency:
       group: ${{ github.workflow }}_${{ github.ref }}
       cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
 
       - name: Set up home
         run: |
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: stable
 
@@ -69,10 +69,10 @@ jobs:
           SLACK_COLOR: ${{ job.status }}
           SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
 
-  teardown-instance:
-    name: Teardown instance (csprng-randomness-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, csprng-randomness-tests ]
+  teardown-ec2:
+    name: Teardown EC2 instance (csprng-randomness-tests)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, csprng-randomness-tests ]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
@@ -83,7 +83,7 @@ jobs:
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -91,4 +91,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
.github/workflows/gpu_4090_full_benchmark.yml (vendored, 18 lines changed)
@@ -39,7 +39,7 @@ jobs:
 
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           fetch-depth: 0
 
@@ -52,12 +52,12 @@ jobs:
         } >> "${GITHUB_ENV}"
 
       - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: nightly
 
       - name: Checkout Slab repo
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: zama-ai/slab
           path: slab
@@ -81,7 +81,7 @@ jobs:
           --throughput
 
       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
           path: ${{ env.RESULTS_FILENAME }}
@@ -120,7 +120,7 @@ jobs:
 
     steps:
       - name: Checkout tfhe-rs
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           fetch-depth: 0
 
@@ -133,12 +133,12 @@ jobs:
         } >> "${GITHUB_ENV}"
 
       - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: nightly
 
       - name: Checkout Slab repo
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: zama-ai/slab
           path: slab
@@ -163,7 +163,7 @@ jobs:
           --throughput
 
       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_core_crypto
           path: ${{ env.RESULTS_FILENAME }}
@@ -194,7 +194,7 @@ jobs:
     name: Remove 4090 bench label
     if: ${{ always() && github.event_name == 'pull_request' }}
     needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
-    runs-on: ubuntu-latest
+    runs-on: ["self-hosted", "4090-desktop"]
     steps:
       - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
         with:
.github/workflows/integer_benchmark.yml (vendored, 10 lines changed)
@@ -46,7 +46,7 @@ jobs:
           echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
 
       - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           fetch-depth: 0
 
@@ -56,7 +56,7 @@ jobs:
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: nightly
 
@@ -70,7 +70,7 @@ jobs:
           parse_integer_benches
 
       - name: Upload csv results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_csv_integer
           path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
           --throughput
 
       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_integer
           path: ${{ env.RESULTS_FILENAME }}
 
       - name: Checkout Slab repo
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: zama-ai/slab
           path: slab
.github/workflows/integer_full_benchmark.yml (vendored, 8 lines changed)
@@ -74,7 +74,7 @@ jobs:
           echo "Request ID: ${{ inputs.request_id }}"
 
       - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           fetch-depth: 0
 
@@ -92,12 +92,12 @@ jobs:
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: nightly
 
       - name: Checkout Slab repo
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: zama-ai/slab
           path: slab
@@ -121,7 +121,7 @@ jobs:
           --throughput
 
       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
           path: ${{ env.RESULTS_FILENAME }}
.github/workflows/integer_gpu_benchmark.yml (vendored, 106 lines changed)
@@ -1,11 +1,10 @@
-# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
 name: Integer GPU benchmarks
 
 on:
   # Allows you to run this workflow manually from the Actions tab as an alternative.
   workflow_dispatch:
-  push:
-    branches:
-      - main
+  pull_request:
 
 env:
   CARGO_TERM_COLOR: always
@@ -14,14 +13,10 @@ env:
   ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
   RUST_BACKTRACE: "full"
   RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
 
 jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-benchmarks)
+  setup-ec2:
+    name: Setup EC2 instance (cuda-benchmarks)
     runs-on: ubuntu-latest
     outputs:
       runner-name: ${{ steps.start-instance.outputs.label }}
@@ -32,15 +27,15 @@ jobs:
         with:
           mode: start
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
+          profile: gpu-bench
 
   cuda-integer-benchmarks:
-    name: Execute GPU integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    name: CUDA integer benchmarks
+    needs: setup-ec2
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     strategy:
       fail-fast: false
       # explicit include-based build matrix, of known valid options
@@ -52,21 +47,12 @@ jobs:
     env:
       CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
       CMAKE_VERSION: 3.29.1
 
     steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
       - name: Install dependencies
         run: |
           sudo apt update
           sudo apt install ca-certificates curl
-          sudo install -m 0755 -d /etc/apt/keyrings
-          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-          sudo chmod a+r /etc/apt/keyrings/docker.asc
-          echo \
-            "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
-            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -74,26 +60,22 @@ jobs:
           make -j"$(nproc)"
           sudo make install
 
+      - name: Get benchmark date
+        run: |
+          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+
       - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           fetch-depth: 0
 
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
       - name: Set up home
         # "Install rust" step require root user to have a HOME directory which is not set.
         run: |
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: nightly
 
@@ -127,33 +109,35 @@ jobs:
           parse_integer_benches
 
       - name: Upload csv results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_csv_integer
           path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
 
       - name: Parse results
         run: |
+          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
+          COMMIT_HASH="$(git describe --tags --dirty)"
           python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
           --database tfhe_rs \
-          --hardware "n3-H100x1" \
+          --hardware "n2-H100x1" \
           --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
+          --project-version "${COMMIT_HASH}" \
           --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
+          --commit-date "${COMMIT_DATE}" \
           --bench-date "${{ env.BENCH_DATE }}" \
           --walk-subdirs \
           --name-suffix avx512 \
           --throughput
 
       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_integer
           path: ${{ env.RESULTS_FILENAME }}
 
       - name: Checkout Slab repo
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: zama-ai/slab
           path: slab
@@ -173,23 +157,23 @@ jobs:
           -d @${{ env.RESULTS_FILENAME }} \
           ${{ secrets.SLAB_URL }}
 
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-benchmarks]
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    if: ${{ !success() && !cancelled() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+  # FIXME This action needs docker to be installed on the machine beforehand.
+  # - name: Slack Notification
+  #   if: ${{ !success() && !cancelled() }}
+  #   continue-on-error: true
+  #   uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+  #   env:
+  #     SLACK_COLOR: ${{ job.status }}
+  #     SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  #     SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  #     SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+  #     SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  #     SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
 
-  teardown-instance:
-    name: Teardown instance (cuda-integer-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-benchmarks ]
+  teardown-ec2:
+    name: Teardown EC2 instance (cuda-benchmarks)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, cuda-integer-benchmarks ]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
@@ -198,9 +182,9 @@ jobs:
         with:
           mode: stop
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -208,4 +192,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (cuda-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
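For reference, the slack-notify job that this workflow replaces with a FIXME comment block was gated so that it ran only when an upstream job failed, and it reported the benchmark job's result rather than its own status. A condensed sketch from the removed lines; `runs-on` is simplified to `ubuntu-latest` here, whereas the original ran on the benchmark runner itself:

```yaml
slack-notify:
  name: Slack Notification
  needs: [ setup-ec2, cuda-integer-benchmarks ]
  # Run only when something upstream failed; skip on success or cancellation.
  if: ${{ !success() && !cancelled() }}
  continue-on-error: true
  runs-on: ubuntu-latest   # simplified; the removed job used the started runner
  steps:
    - name: Send message
      uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
      env:
        # Report the benchmark job's result, not this job's own status.
        SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
        SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
```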
.github/workflows/integer_gpu_full_benchmark.yml (vendored, 81 lines changed)
@@ -1,11 +1,10 @@
-# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
 name: Integer GPU full benchmarks
 
 on:
   # Allows you to run this workflow manually from the Actions tab as an alternative.
   workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
+  pull_request:
 
 env:
   CARGO_TERM_COLOR: always
@@ -13,14 +12,10 @@ env:
   ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
   RUST_BACKTRACE: "full"
   RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
 
 jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-full-benchmarks)
+  setup-ec2:
+    name: Setup EC2 instance (cuda-full-benchmarks)
     runs-on: ubuntu-latest
     outputs:
       runner-name: ${{ steps.start-instance.outputs.label }}
@@ -31,15 +26,15 @@ jobs:
         with:
           mode: start
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
+          profile: gpu-bench
 
   cuda-integer-full-benchmarks:
-    name: Execute GPU integer benchmarks for all operations flavor
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    name: CUDA integer full benchmarks
+    needs: setup-ec2
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
     timeout-minutes: 1440 # 24 hours
     continue-on-error: true
     strategy:
@@ -56,21 +51,12 @@ jobs:
     env:
       CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
       CMAKE_VERSION: 3.29.1
 
     steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
       - name: Install dependencies
         run: |
           sudo apt update
           sudo apt install ca-certificates curl
-          sudo install -m 0755 -d /etc/apt/keyrings
-          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-          sudo chmod a+r /etc/apt/keyrings/docker.asc
-          echo \
-            "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
-            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -79,7 +65,7 @@ jobs:
           sudo make install
 
       - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           fetch-depth: 0
 
@@ -97,7 +83,7 @@ jobs:
           echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
 
       - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
         with:
           toolchain: nightly
 
@@ -122,7 +108,7 @@ jobs:
         } >> "${GITHUB_ENV}"
 
       - name: Checkout Slab repo
-        uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
+        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
         with:
           repository: zama-ai/slab
           path: slab
@@ -136,7 +122,7 @@ jobs:
         run: |
           python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
           --database tfhe_rs \
-          --hardware "n3-H100x1" \
+          --hardware "n2-H100x1" \
           --backend gpu \
           --project-version "${{ env.COMMIT_HASH }}" \
           --branch ${{ github.ref_name }} \
@@ -147,7 +133,7 @@ jobs:
           --throughput
 
       - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
         with:
           name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
           path: ${{ env.RESULTS_FILENAME }}
@@ -166,18 +152,23 @@ jobs:
           -d @${{ env.RESULTS_FILENAME }} \
           ${{ secrets.SLAB_URL }}
 
-      - name: Slack Notification
-        if: ${{ !success() && !cancelled() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+      # FIXME This action needs docker to be installed on the machine beforehand.
+      # - name: Slack Notification
+      #   if: ${{ !success() && !cancelled() }}
+      #   continue-on-error: true
+      #   uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+      #   env:
+      #     SLACK_COLOR: ${{ job.status }}
+      #     SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      #     SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+      #     SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+      #     SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+      #     SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
 
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-full-benchmarks ]
+  teardown-ec2:
+    name: Teardown EC2 instance (cuda-full-benchmarks)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, cuda-integer-full-benchmarks ]
     runs-on: ubuntu-latest
     steps:
       - name: Stop instance
@@ -186,9 +177,9 @@ jobs:
         with:
           mode: stop
           github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
           job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
 
       - name: Slack Notification
         if: ${{ failure() }}
@@ -196,4 +187,4 @@ jobs:
         uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
         env:
           SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "EC2 teardown (cuda-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -46,7 +46,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -56,7 +56,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -91,13 +91,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
# Run integer benchmarks with multi-bit cryptographic parameters on an instance and return parsed results to Slab CI bot.
|
||||
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Integer GPU Multi-bit benchmarks
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -14,14 +13,10 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-multi-bit-benchmarks)
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (cuda-multi-bit-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
@@ -32,15 +27,15 @@ jobs:
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
profile: gpu-bench
|
||||
|
||||
cuda-integer-multi-bit-benchmarks:
|
||||
name: Execute GPU integer multi-bit benchmarks
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
name: CUDA integer multi-bit benchmarks
|
||||
needs: setup-ec2
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -53,21 +48,12 @@ jobs:
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.1
|
||||
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install ca-certificates curl
|
||||
sudo install -m 0755 -d /etc/apt/keyrings
|
||||
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
|
||||
sudo chmod a+r /etc/apt/keyrings/docker.asc
|
||||
echo \
|
||||
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
|
||||
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
|
||||
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
@@ -75,26 +61,22 @@ jobs:
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
|
||||
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -128,33 +110,35 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-H100x1" \
|
||||
--hardware "n2-H100x1" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
|
||||
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -174,18 +158,23 @@ jobs:
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Integer GPU multi-bit benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
# FIXME This action needs docker to be installed on the machine beforehand.
|
||||
# - name: Slack Notification
|
||||
# if: ${{ !success() && !cancelled() }}
|
||||
# continue-on-error: true
|
||||
# uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
# env:
|
||||
# SLACK_COLOR: ${{ job.status }}
|
||||
# SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
# SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
# SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
# SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
# SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-integer-full-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (cuda-multi-bit-benchmarks)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, cuda-integer-multi-bit-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
@@ -194,9 +183,9 @@ jobs:
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
@@ -204,4 +193,4 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "EC2 teardown (cuda-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
11
.github/workflows/m1_tests.yml
vendored
@@ -31,10 +31,10 @@ jobs:
timeout-minutes: 720

steps:
- uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633

- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable

@@ -86,13 +86,6 @@ jobs:
run: |
make test_boolean

# Because we do "illegal" things with the build system which Cargo does not seem to like much
# we need to clear the cache to make sure the C API is built properly and does not use a stale
# cached version
- name: Clear build cache
run: |
cargo clean

- name: Run C API tests
run: |
make test_c_api
2
.github/workflows/make_release.yml
vendored
@@ -30,7 +30,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
24
.github/workflows/make_release_cuda.yml
vendored
@@ -21,8 +21,8 @@ env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

jobs:
setup-instance:
name: Setup instance (publish-cuda-release)
setup-ec2:
name: Setup EC2 instance (publish-cuda-release)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -40,8 +40,8 @@ jobs:

publish-cuda-release:
name: Publish CUDA Release
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
needs: setup-ec2
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -54,7 +54,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable

@@ -104,10 +104,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

teardown-instance:
name: Teardown instance (publish-release)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, publish-cuda-release ]
teardown-ec2:
name: Teardown EC2 instance (publish-release)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, publish-cuda-release ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -118,7 +118,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}

- name: Slack Notification
if: ${{ failure() }}
@@ -126,4 +126,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
4
.github/workflows/parameters_check.yml
vendored
@@ -17,10 +17,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633

- name: Checkout lattice-estimator
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: malb/lattice-estimator
path: lattice_estimator
8
.github/workflows/shortint_benchmark.yml
vendored
@@ -45,7 +45,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

@@ -55,7 +55,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly

@@ -89,13 +89,13 @@ jobs:
--append-results

- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_shortint
path: ${{ env.RESULTS_FILENAME }}

- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

@@ -53,7 +53,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"

- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

@@ -71,12 +71,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly

- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -115,7 +115,7 @@ jobs:
--append-results

- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
10
.github/workflows/signed_integer_benchmark.yml
vendored
@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly

@@ -70,7 +70,7 @@ jobs:
parse_integer_benches

- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput

- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

@@ -52,7 +52,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"

- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

@@ -70,12 +70,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly

- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -99,7 +99,7 @@ jobs:
--throughput

- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly

@@ -70,7 +70,7 @@ jobs:
parse_integer_benches

- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput

- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
13
.github/workflows/start_benchmarks.yml
vendored
@@ -36,6 +36,10 @@ on:
description: "Run core crypto benches"
type: boolean
default: true
core_crypto_gpu_bench:
description: "Run core crypto benches on GPU"
type: boolean
default: true
wasm_client_bench:
description: "Run WASM client benches"
type: boolean
@@ -49,17 +53,18 @@ jobs:
command: [ boolean_bench, shortint_bench,
integer_bench, integer_multi_bit_bench,
signed_integer_bench, signed_integer_multi_bit_bench,
core_crypto_bench, wasm_client_bench ]
integer_gpu_bench, integer_multi_bit_gpu_bench,
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
with:
files_yaml: |
common_benches:
@@ -106,7 +111,7 @@ jobs:
- .github/workflows/wasm_client_benchmark.yml

- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
8
.github/workflows/start_full_benchmarks.yml
vendored
@@ -25,17 +25,17 @@ jobs:
strategy:
matrix:
command: [ boolean_bench, shortint_full_bench,
integer_full_bench, signed_integer_full_bench,
core_crypto_bench, wasm_client_bench ]
integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
7
.github/workflows/sync_on_push.yml
vendored
@@ -13,9 +13,14 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
- name: Save repo
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: repo-archive
path: '.'
- name: git-sync
uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
with:
8
.github/workflows/wasm_client_benchmark.yml
vendored
@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly

@@ -98,13 +98,13 @@ jobs:
--append-results

- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}

- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
48
Makefile
@@ -160,12 +160,6 @@ check_nvm_installed:
@source ~/.nvm/nvm.sh && nvm --version > /dev/null 2>&1 || \
( echo "Unable to locate Node. Run 'make install_node'" && exit 1 )

.PHONY: install_mlc # Install mlc (Markup Link Checker)
install_mlc: install_rs_build_toolchain
@mlc --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install mlc --locked || \
( echo "Unable to install mlc, unknown error." && exit 1 )

.PHONY: fmt # Format rust code
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt

@@ -277,7 +271,7 @@ clippy_js_wasm_api: install_rs_check_toolchain
-p $(TFHE_SPEC) -- --no-deps -D warnings

.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
clippy_tasks: install_rs_check_toolchain
clippy_tasks:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-p tasks -- --no-deps -D warnings

@@ -287,19 +281,19 @@ clippy_trivium: install_rs_check_toolchain
-p tfhe-trivium -- --no-deps -D warnings

.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
clippy_all_targets: install_rs_check_toolchain
clippy_all_targets:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings

.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
clippy_concrete_csprng: install_rs_check_toolchain
clippy_concrete_csprng:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE) \
-p concrete-csprng -- --no-deps -D warnings

.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
clippy_zk_pok: install_rs_check_toolchain
clippy_zk_pok:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok -- --no-deps -D warnings

@@ -382,7 +376,7 @@ build_c_api_gpu: install_rs_check_toolchain
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint

@@ -450,14 +444,14 @@ test_cuda_backend:
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend

.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::

.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_rs_build_toolchain
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
@@ -485,14 +479,14 @@ test_c_api_rs: install_rs_check_toolchain

.PHONY: test_c_api_c # Run the C tests for the C API
test_c_api_c: build_c_api
./scripts/c_api_tests.sh --cargo-profile "$(CARGO_PROFILE)"
./scripts/c_api_tests.sh

.PHONY: test_c_api # Run all the tests for the C API
test_c_api: test_c_api_rs test_c_api_c

.PHONY: test_c_api_gpu # Run the C tests for the C API
test_c_api_gpu: build_c_api_gpu
./scripts/c_api_tests.sh --gpu --cargo-profile "$(CARGO_PROFILE)"
./scripts/c_api_tests.sh --gpu

.PHONY: test_shortint_ci # Run the tests for shortint ci
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
@@ -644,12 +638,12 @@ test_kreyvium: install_rs_build_toolchain
-p tfhe-trivium -- --test-threads=1 kreyvium::

.PHONY: test_concrete_csprng # Run concrete-csprng tests
test_concrete_csprng: install_rs_build_toolchain
test_concrete_csprng:
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng

.PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
test_zk_pok: install_rs_build_toolchain
test_zk_pok:
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok

@@ -688,23 +682,15 @@ format_doc_latex:
check_md_docs_are_tested:
RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested

.PHONY: check_intra_md_links # Checks broken internal links in Markdown docs
check_intra_md_links: install_mlc
mlc --offline --match-file-extension tfhe/docs

.PHONY: check_md_links # Checks all broken links in Markdown docs
check_md_links: install_mlc
mlc --match-file-extension tfhe/docs

.PHONY: check_compile_tests # Build tests in debug without running them
check_compile_tests: install_rs_build_toolchain
check_compile_tests:
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
-p $(TFHE_SPEC)

@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
"$(MAKE)" build_c_api && \
./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
./scripts/c_api_tests.sh --build-only; \
fi

.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
@@ -827,6 +813,8 @@ bench_oprf: install_rs_check_toolchain
--bench oprf-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
bench_shortint_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -835,6 +823,7 @@ bench_shortint_multi_bit: install_rs_check_toolchain
--bench shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

.PHONY: bench_boolean # Run benchmarks for boolean
bench_boolean: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
@@ -878,7 +867,6 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
#
# Utility tools
#

.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
gen_key_cache: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
@@ -954,8 +942,8 @@ sha256_bool: install_rs_check_toolchain
--features=$(TARGET_ARCH_FEATURE),boolean

.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
clippy_all check_compile_tests
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_all \
check_compile_tests

.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
@@ -71,7 +71,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
}

fn main() {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
let (client_key, server_key) = generate_keys(config);

let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -143,7 +143,7 @@ use tfhe::prelude::*;
use tfhe_trivium::TriviumStreamShortint;

fn test_shortint() {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::all_disabled().enable_default_integers().build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = CastingKey::new((&client_key, &server_key), (&hl_client_key, &hl_server_key));
@@ -13,7 +13,6 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

[build-dependencies]
cmake = { version = "0.1" }
pkg-config = { version = "0.3" }

[dependencies]
thiserror = "1.0"

@@ -21,15 +21,7 @@ fn main() {
let dest = cmake::build("cuda");
println!("cargo:rustc-link-search=native={}", dest.display());
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");

// Try to find the cuda libs with pkg-config, default to the path used by the nvidia runfile
if pkg_config::Config::new()
.atleast_version("10")
.probe("cuda")
.is_err()
{
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
}
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
println!("cargo:rustc-link-lib=gomp");
println!("cargo:rustc-link-lib=cudart");
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash

set -e
@@ -4,14 +4,14 @@
#include <cstdint>

extern "C" {
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);
};

@@ -8,6 +8,7 @@
#include <cuda_runtime.h>

#define synchronize_threads_in_block() __syncthreads()

extern "C" {

#define check_cuda_error(ans) \
@@ -26,33 +27,47 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort(); \
}

cudaStream_t cuda_create_stream(uint32_t gpu_index);
struct cuda_stream_t {
cudaStream_t stream;
uint32_t gpu_index;

void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t(uint32_t gpu_index) {
this->gpu_index = gpu_index;

void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
check_cuda_error(cudaStreamCreate(&stream));
}

void release() {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamDestroy(stream));
}

void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
};

cuda_stream_t *cuda_create_stream(uint32_t gpu_index);

void cuda_destroy_stream(cuda_stream_t *stream);

void *cuda_malloc(uint64_t size, uint32_t gpu_index);

void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);

void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

bool cuda_check_support_cooperative_groups();

bool cuda_check_support_thread_block_clusters();

void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);

void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);

void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);

void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);

int cuda_get_number_of_gpus();

@@ -60,18 +75,20 @@ void cuda_synchronize_device(uint32_t gpu_index);

void cuda_drop(void *ptr, uint32_t gpu_index);

void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cuda_stream_t *stream);

int cuda_get_max_shared_memory(uint32_t gpu_index);

void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
void cuda_synchronize_stream(cuda_stream_t *stream);

void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data);
}

void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer);
}

template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
Torus n);
#endif
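The net effect of the hunks above is that every entry point that used to take a raw cudaStream_t plus a separate gpu_index now takes a single cuda_stream_t * handle that carries both, so the GPU index no longer has to be threaded through every call. A minimal usage sketch of the new handle-based API, based only on the declarations diffed above; the include path, buffer size, and host data are illustrative assumptions, not taken from the source:

#include <cstdint>
#include "device.h" // assumed include path for the header diffed above

int main() {
  uint32_t gpu_index = 0;
  // The handle owns both the CUDA stream and the GPU index it was created on.
  cuda_stream_t *stream = cuda_create_stream(gpu_index);

  uint64_t size = 1024 * sizeof(uint64_t);
  uint64_t h_array[1024] = {0};
  // Allocation, copies, and memset now only need the stream handle.
  void *d_array = cuda_malloc_async(size, stream);
  cuda_memcpy_async_to_gpu(d_array, h_array, size, stream);
  cuda_memset_async(d_array, 0, size, stream);
  cuda_memcpy_async_to_cpu(h_array, d_array, size, stream);

  // Wait for the queued work, then release the device buffer and the stream.
  cuda_synchronize_stream(stream);
  cuda_drop_async(d_array, stream);
  cuda_destroy_stream(stream);
  return 0;
}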
@@ -1,10 +0,0 @@
#ifndef HELPER_H
#define HELPER_H

extern "C" {
int cuda_setup_multi_gpu();
}

void multi_gpu_checks(uint32_t gpu_count);

#endif
File diff suppressed because it is too large
@@ -6,16 +6,16 @@
extern "C" {

void cuda_keyswitch_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);

void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
}

#endif // CNCRT_KS_H_
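The keyswitch entry points follow the same pattern: the leading void *stream, uint32_t gpu_index pair collapses into a single cuda_stream_t *, while the remaining arguments keep their order. A hedged call sketch against the new 64-bit signature; all pointers are assumed to be device buffers prepared elsewhere, and the dimensions are placeholders rather than recommended parameters:

#include <cstdint>
#include "keyswitch.h" // assumed include path for the header diffed above

// Illustrative wrapper around the new signature; every value below is a
// placeholder chosen for the example only.
void keyswitch_example(cuda_stream_t *stream, void *lwe_out,
                       void *lwe_out_indexes, void *lwe_in,
                       void *lwe_in_indexes, void *ksk) {
  uint32_t lwe_dimension_in = 742, lwe_dimension_out = 512; // placeholders
  uint32_t base_log = 3, level_count = 5, num_samples = 1;  // placeholders
  cuda_keyswitch_lwe_ciphertext_vector_64(
      stream, lwe_out, lwe_out_indexes, lwe_in, lwe_in_indexes, ksk,
      lwe_dimension_in, lwe_dimension_out, base_log, level_count,
      num_samples);
}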
@@ -7,42 +7,42 @@

extern "C" {

void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
}
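The element-wise linear algebra operations change in the same way, which makes call sites noticeably shorter. A sketch for the 64-bit homomorphic addition under the same assumptions (assumed include path, device-resident buffers, placeholder sizes):

#include <cstdint>
#include "linear_algebra.h" // assumed include path for the header diffed above

// Adds two LWE ciphertext vectors element-wise on the GPU using the new
// handle-based signature; the sizes are placeholders for the example.
void add_example(cuda_stream_t *stream, void *lwe_out, void *lwe_in_1,
                 void *lwe_in_2) {
  uint32_t input_lwe_dimension = 512;      // placeholder
  uint32_t input_lwe_ciphertext_count = 4; // placeholder
  cuda_add_lwe_ciphertext_vector_64(stream, lwe_out, lwe_in_1, lwe_in_2,
                                    input_lwe_dimension,
                                    input_lwe_ciphertext_count);
}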
@@ -5,89 +5,80 @@
|
||||
#include <cstdint>
|
||||
|
||||
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
|
||||
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
|
||||
enum PBS_VARIANT { DEFAULT = 0, CG = 1 };
|
||||
|
||||
extern "C" {
|
||||
void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
void *input1, void *input2, void *output,
|
||||
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t total_polynomials);
|
||||
|
||||
void cuda_convert_lwe_programmable_bootstrap_key_32(
|
||||
void *stream, uint32_t gpu_index, void *dest, void *src,
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size);
|
||||
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);
|
||||
|
||||
void cuda_convert_lwe_programmable_bootstrap_key_64(
|
||||
void *stream, uint32_t gpu_index, void *dest, void *src,
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size);
|
||||
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_amortized_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_amortized_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory);
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory);
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
                                                   uint32_t gpu_index,
void cleanup_cuda_programmable_bootstrap_amortized(cuda_stream_t *stream,
                                                   int8_t **pbs_buffer);

void scratch_cuda_programmable_bootstrap_32(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

void scratch_cuda_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory);
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory);
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
void cleanup_cuda_programmable_bootstrap(cuda_stream_t *stream,
                                         int8_t **pbs_buffer);

uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
@@ -120,28 +111,6 @@ get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
         sizeof(Torus) * polynomial_size +      // accumulator
         sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
    uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
    uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // tbc
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
@@ -156,11 +125,6 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}

template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t max_shared_memory);

template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
@@ -171,14 +135,13 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

  PBS_VARIANT pbs_variant;

  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
             bool allocate_gpu_memory) {

    this->pbs_variant = pbs_variant;

    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);

    if (allocate_gpu_memory) {
      switch (pbs_variant) {
@@ -210,17 +173,17 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
                       level_count * (glwe_dimension + 1);
        }
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                (polynomial_size / 2) * sizeof(double2),
            stream, gpu_index);
            stream);

        global_accumulator = (Torus *)cuda_malloc_async(
            (glwe_dimension + 1) * input_lwe_ciphertext_count *
                polynomial_size * sizeof(Torus),
            stream, gpu_index);
            stream);
      } break;
      case PBS_VARIANT::CG: {
        uint64_t full_sm =
@@ -243,73 +206,25 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
        }

        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
            stream, gpu_index);
            stream);
      } break;
#if CUDA_ARCH >= 900
      case PBS_VARIANT::TBC: {

        bool supports_dsm =
            supports_distributed_shared_memory_on_classic_programmable_bootstrap<
                Torus>(polynomial_size, max_shared_memory);

        uint64_t full_sm =
            get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
                polynomial_size);
        uint64_t partial_sm =
            get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
                polynomial_size);
        uint64_t minimum_sm_tbc = 0;
        if (supports_dsm)
          minimum_sm_tbc =
              get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
                  Torus>(polynomial_size);

        uint64_t partial_dm = full_sm - partial_sm;
        uint64_t full_dm = full_sm;
        uint64_t device_mem = 0;

        // There is a minimum amount of memory we need to run the TBC PBS, which
        // is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
        // because otherwise the previous check would have redirected
        // computation to some other variant. If over that we don't have more
        // partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
        // not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
        //
        // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
        if (max_shared_memory < partial_sm + minimum_sm_tbc) {
          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        }

        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
            stream, gpu_index);
      } break;
#endif
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }
    }
  }

  void release(cudaStream_t stream, uint32_t gpu_index) {
    cuda_drop_async(d_mem, stream, gpu_index);
    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
  void release(cuda_stream_t *stream) {
    cuda_drop_async(d_mem, stream);
    cuda_drop_async(global_accumulator_fft, stream);

    if (pbs_variant == DEFAULT)
      cuda_drop_async(global_accumulator, stream, gpu_index);
      cuda_drop_async(global_accumulator, stream);
  }
};
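
The comment block in the TBC branch above captures a decision rule that every PBS variant in this file repeats: compare the device's maximum shared memory against the partial and full per-block footprints (plus the distributed-shared-memory minimum, when applicable), and back whatever does not fit with global memory. A minimal standalone sketch of that rule — the enum and helper names here are illustrative, not part of the codebase:

// Illustrative helper, not part of the diff: picks the shared-memory mode the
// way the pbs_buffer constructors above do. minimum_sm is the extra
// distributed-shared-memory requirement (0 when DSM is unsupported).
enum PBS_SM_MODE { SM_NOSM, SM_PARTIALSM, SM_FULLSM };

inline PBS_SM_MODE pick_sm_mode(uint64_t max_shared_memory, uint64_t partial_sm,
                                uint64_t full_sm, uint64_t minimum_sm) {
  if (max_shared_memory < partial_sm + minimum_sm)
    return SM_NOSM;      // everything spills: full_dm bytes of global memory per block
  if (max_shared_memory < full_sm + minimum_sm)
    return SM_PARTIALSM; // only the overflow (full_sm - partial_sm) goes to global memory
  return SM_FULLSM;      // both kernels run entirely in shared memory
}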

@@ -348,9 +263,9 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,

template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
@@ -358,54 +273,28 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(

template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
    uint32_t lwe_idx, uint32_t max_shared_memory);

#if (CUDA_ARCH >= 900)
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
    uint32_t lwe_idx, uint32_t max_shared_memory);

template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);
#endif

template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_cg(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
    cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
                                                    uint32_t glwe_dimension,
                                                    uint32_t polynomial_size,
                                                    uint32_t level_count,
                                                    uint32_t max_shared_memory);

#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,

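The declarations above follow a scratch/run/cleanup lifecycle. A minimal sketch of a single 64-bit classical PBS call under the cuda_stream_t API introduced by this diff; all pointers and sizes are caller-supplied placeholders and error handling is elided:

// Hypothetical call sequence; buffer sizing and device-pointer setup are
// assumed to happen elsewhere in the caller.
void run_classical_pbs_64(cuda_stream_t *stream, void *lwe_out, void *out_idx,
                          void *lut, void *lut_idx, void *lwe_in, void *in_idx,
                          void *bsk, uint32_t lwe_dim, uint32_t glwe_dim,
                          uint32_t poly_size, uint32_t base_log,
                          uint32_t level_count, uint32_t num_samples,
                          uint32_t max_shared_memory) {
  int8_t *pbs_buffer = nullptr;
  // Allocate the scratch buffer (and internal accumulators) once.
  scratch_cuda_programmable_bootstrap_64(stream, &pbs_buffer, glwe_dim,
                                         poly_size, level_count, num_samples,
                                         max_shared_memory,
                                         /*allocate_gpu_memory=*/true);
  // Bootstrap a whole vector of LWE ciphertexts in one call.
  cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
      stream, lwe_out, out_idx, lut, lut_idx, lwe_in, in_idx, bsk, pbs_buffer,
      lwe_dim, glwe_dim, poly_size, base_log, level_count, num_samples,
      /*num_luts=*/1, /*lwe_idx=*/0, max_shared_memory);
  // Release the scratch buffer on the same stream.
  cleanup_cuda_programmable_bootstrap(stream, &pbs_buffer);
}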
@@ -11,98 +11,59 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t num_samples, uint32_t max_shared_memory);

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);
    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
    uint32_t grouping_factor);

void scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t chunk_size = 0);
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory,
    uint32_t chunk_size = 0);

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);

void scratch_cuda_generic_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory,
    uint32_t lwe_chunk_size = 0);

void cuda_generic_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);

void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
                                                   int8_t **pbs_buffer);
}

template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t max_shared_memory);

template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t max_shared_memory);

#if CUDA_ARCH >= 900
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size);

template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
    uint32_t lwe_chunk_size);
#endif

template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);

template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);

template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -111,7 +72,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(

template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
@@ -119,9 +80,9 @@ void scratch_cuda_multi_bit_programmable_bootstrap(

template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -152,25 +113,12 @@ template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
    uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size);

template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
  int8_t *d_mem_acc_step_two = NULL;
  int8_t *d_mem_acc_cg = NULL;
  int8_t *d_mem_acc_tbc = NULL;

  double2 *keybundle_fft;
  Torus *global_accumulator;
@@ -178,27 +126,25 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {

  PBS_VARIANT pbs_variant;

  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);

    // default
    uint64_t full_sm_keybundle =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<
            Torus>(polynomial_size);
    uint64_t full_sm_accumulate_step_one =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
            polynomial_size);
    uint64_t full_sm_accumulate_step_two =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
            polynomial_size);
    uint64_t partial_sm_accumulate_step_one =
        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
            Torus>(polynomial_size);
    // cg
    uint64_t full_sm_accumulate_step_two =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
            polynomial_size);
    uint64_t full_sm_cg_accumulate =
        get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
@@ -216,124 +162,80 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
    auto num_blocks_acc_cg =
        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;

#if CUDA_ARCH >= 900
    uint64_t full_sm_tbc_accumulate =
        get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
    uint64_t partial_sm_tbc_accumulate =
        get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
    uint64_t minimum_sm_tbc =
        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
    auto num_blocks_acc_tbc = num_blocks_acc_cg;
#endif

    if (allocate_gpu_memory) {
      // Keybundle
      if (max_shared_memory < full_sm_keybundle)
        d_mem_keybundle = (int8_t *)cuda_malloc_async(
            num_blocks_keybundle * full_sm_keybundle, stream, gpu_index);
            num_blocks_keybundle * full_sm_keybundle, stream);

      switch (pbs_variant) {
      case PBS_VARIANT::CG:
        // Accumulator CG
        if (max_shared_memory < partial_sm_cg_accumulate)
          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
              num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index);
        else if (max_shared_memory < full_sm_cg_accumulate)
          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
              num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index);
        break;
      case PBS_VARIANT::DEFAULT:
      case DEFAULT:
        // Accumulator step one
        if (max_shared_memory < partial_sm_accumulate_step_one)
          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
              num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
              gpu_index);
              num_blocks_acc_step_one * full_sm_accumulate_step_one, stream);
        else if (max_shared_memory < full_sm_accumulate_step_one)
          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
              num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
              gpu_index);
              num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream);

        // Accumulator step two
        if (max_shared_memory < full_sm_accumulate_step_two)
          d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
              num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
              gpu_index);
              num_blocks_acc_step_two * full_sm_accumulate_step_two, stream);
        break;
#if CUDA_ARCH >= 900
      case TBC:
        // There is a minimum amount of memory we need to run the TBC PBS, which
        // is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
        // because otherwise the previous check would have redirected
        // computation to some other variant. If over that we don't have more
        // partial_sm_tbc_accumulate bytes, TBC PBS will run on NOSM. If we have
        // partial_sm_tbc_accumulate but not full_sm_tbc_accumulate bytes, it
        // will run on PARTIALSM. Otherwise, FULLSM.
        //
        // NOSM mode actually requires minimum_sm_tbc shared memory bytes.

        // Accumulator TBC
        if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
              num_blocks_acc_tbc * full_sm_tbc_accumulate, stream, gpu_index);
        else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
              num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream,
              gpu_index);
      case CG:
        // Accumulator CG
        if (max_shared_memory < partial_sm_cg_accumulate)
          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
              num_blocks_acc_cg * full_sm_cg_accumulate, stream);
        else if (max_shared_memory < full_sm_cg_accumulate)
          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
              num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
        break;
#endif
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }

      keybundle_fft = (double2 *)cuda_malloc_async(
          num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
          stream, gpu_index);
          stream);
      global_accumulator = (Torus *)cuda_malloc_async(
          num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
          gpu_index);
          num_blocks_acc_step_two * polynomial_size * sizeof(Torus), stream);
      global_accumulator_fft = (double2 *)cuda_malloc_async(
          num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
          stream, gpu_index);
          stream);
    }
  }

  void release(cudaStream_t stream, uint32_t gpu_index) {
  void release(cuda_stream_t *stream) {

    if (d_mem_keybundle)
      cuda_drop_async(d_mem_keybundle, stream, gpu_index);
      cuda_drop_async(d_mem_keybundle, stream);
    switch (pbs_variant) {
    case DEFAULT:
      if (d_mem_acc_step_one)
        cuda_drop_async(d_mem_acc_step_one, stream, gpu_index);
        cuda_drop_async(d_mem_acc_step_one, stream);
      if (d_mem_acc_step_two)
        cuda_drop_async(d_mem_acc_step_two, stream, gpu_index);
        cuda_drop_async(d_mem_acc_step_two, stream);
      break;
    case CG:
      if (d_mem_acc_cg)
        cuda_drop_async(d_mem_acc_cg, stream, gpu_index);
        cuda_drop_async(d_mem_acc_cg, stream);
      break;
#if CUDA_ARCH >= 900
    case TBC:
      if (d_mem_acc_tbc)
        cuda_drop_async(d_mem_acc_tbc, stream, gpu_index);
      break;
#endif
    default:
      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }

    cuda_drop_async(keybundle_fft, stream, gpu_index);
    cuda_drop_async(global_accumulator, stream, gpu_index);
    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
    cuda_drop_async(keybundle_fft, stream);
    cuda_drop_async(global_accumulator, stream);
    cuda_drop_async(global_accumulator_fft, stream);
  }
};

template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                                     uint32_t polynomial_size,
                                     uint32_t max_shared_memory);
#ifdef __CUDACC__

__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count);

#endif

#endif // CUDA_MULTI_BIT_H

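The multi-bit entry points mirror the classical lifecycle. A hedged sketch of one 64-bit multi-bit PBS call, using only the signatures declared above (leaving the defaulted lwe_chunk_size at 0 lets the backend pick a chunking); every pointer and parameter is a placeholder:

// Hypothetical call sequence for the multi-bit variant; pointer setup and
// parameter values are assumptions supplied by the caller.
void run_multi_bit_pbs_64(cuda_stream_t *stream, void *lwe_out, void *out_idx,
                          void *lut, void *lut_idx, void *lwe_in, void *in_idx,
                          void *bsk, uint32_t lwe_dim, uint32_t glwe_dim,
                          uint32_t poly_size, uint32_t grouping_factor,
                          uint32_t base_log, uint32_t level_count,
                          uint32_t num_samples, uint32_t max_shared_memory) {
  int8_t *pbs_buffer = nullptr;
  // Allocate the keybundle / accumulator scratch space.
  scratch_cuda_multi_bit_programmable_bootstrap_64(
      stream, &pbs_buffer, lwe_dim, glwe_dim, poly_size, level_count,
      grouping_factor, num_samples, max_shared_memory,
      /*allocate_gpu_memory=*/true);
  // Run the multi-bit PBS over the whole ciphertext vector.
  cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
      stream, lwe_out, out_idx, lut, lut_idx, lwe_in, in_idx, bsk, pbs_buffer,
      lwe_dim, glwe_dim, poly_size, grouping_factor, base_log, level_count,
      num_samples, /*num_luts=*/1, /*lwe_idx=*/0, max_shared_memory);
  cleanup_cuda_multi_bit_programmable_bootstrap(stream, &pbs_buffer);
}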
@@ -10,8 +10,7 @@ set(SOURCES
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper.h)
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)

@@ -1,21 +1 @@
#include "ciphertext.cuh"

void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
      (uint64_t *)src, number_of_cts, lwe_dimension);
}

void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                  uint32_t gpu_index,
                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
      (uint64_t *)src, number_of_cts, lwe_dimension);
}

@@ -6,23 +6,39 @@
#include <cstdint>

template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
                                               uint32_t gpu_index, T *dest,
                                               T *src, uint32_t number_of_cts,
void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
                                               cuda_stream_t *stream,
                                               uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
  cudaSetDevice(gpu_index);
  cudaSetDevice(stream->gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
  cuda_memcpy_async_to_gpu(dest, src, size, stream);
}

void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
}

template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
                                               uint32_t gpu_index, T *dest,
                                               T *src, uint32_t number_of_cts,
void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
                                               cuda_stream_t *stream,
                                               uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
  cudaSetDevice(gpu_index);
  cudaSetDevice(stream->gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
  cuda_memcpy_async_to_cpu(dest, src, size, stream);
}

void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
}

#endif

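A minimal round-trip sketch using the converters above, assuming the cuda_malloc_async, cuda_drop_async, and cuda_synchronize_stream helpers from this same diff; h_in and h_out are caller-owned host buffers:

// Hypothetical round trip through the converters above.
void roundtrip_lwe_vector_64(cuda_stream_t *stream, uint64_t *h_in,
                             uint64_t *h_out, uint32_t number_of_cts,
                             uint32_t lwe_dimension) {
  uint64_t size =
      (uint64_t)number_of_cts * (lwe_dimension + 1) * sizeof(uint64_t);
  // Device buffer allocated on the same stream that will use it.
  uint64_t *d_cts = (uint64_t *)cuda_malloc_async(size, stream);
  cuda_convert_lwe_ciphertext_vector_to_gpu_64(d_cts, h_in, stream,
                                               number_of_cts, lwe_dimension);
  cuda_convert_lwe_ciphertext_vector_to_cpu_64(h_out, d_cts, stream,
                                               number_of_cts, lwe_dimension);
  // Make the copies observable on the host, then free the device buffer.
  cuda_synchronize_stream(stream);
  cuda_drop_async(d_cts, stream);
}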
@@ -49,15 +49,11 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
 * global memory
 */
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
                           uint32_t gpu_count, double2 *dest, T *src,
void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
                           int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
                           uint32_t polynomial_size, uint32_t level_count,
                           uint32_t max_shared_memory) {
  if (gpu_count != 1)
    PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
          "supported yet.")
  cudaSetDevice(gpu_indexes[0]);
                           uint32_t gpu_index, uint32_t max_shared_memory) {
  cudaSetDevice(stream->gpu_index);

  int shared_memory_size = sizeof(double) * polynomial_size;

@@ -66,11 +62,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,

  if (max_shared_memory < shared_memory_size) {
    device_batch_fft_ggsw_vector<T, ST, params, NOSM>
        <<<gridSize, blockSize, 0, streams[0]>>>(dest, src, d_mem);
        <<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
  } else {
    device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
        <<<gridSize, blockSize, shared_memory_size, streams[0]>>>(dest, src,
                                                                  d_mem);
        <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
                                                                      d_mem);
  }
  check_cuda_error(cudaGetLastError());
}

@@ -6,13 +6,12 @@
 * Head out to the equivalent operation on 64 bits for more details.
 */
void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      stream, static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
      static_cast<uint32_t *>(lwe_array_in),
      static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
@@ -36,13 +35,12 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
 * - num_samples blocks of threads are launched
 */
void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      stream, static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
      static_cast<uint64_t *>(lwe_array_in),
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),

@@ -98,12 +98,12 @@ keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
/// assume lwe_array_in in the gpu
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {

  cudaSetDevice(gpu_index);
  cudaSetDevice(stream->gpu_index);
  constexpr int ideal_threads = 128;

  int lwe_size = lwe_dimension_out + 1;
@@ -124,14 +124,13 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(

  int shared_mem = sizeof(Torus) * lwe_size;

  cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream,
                    gpu_index);
  cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
  check_cuda_error(cudaGetLastError());

  dim3 grid(num_samples, 1, 1);
  dim3 threads(ideal_threads, 1, 1);

  keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
  keyswitch<Torus><<<grid, threads, shared_mem, stream->stream>>>(
      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
      lwe_upper, cutoff);

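The keyswitch entry points take explicit input/output index arrays, which lets a caller gather from and scatter to non-contiguous ciphertext slots. A hedged sketch that builds trivial identity indexes and invokes the 64-bit entry point; every name here is a placeholder:

// Illustrative setup of identity index arrays for the keyswitch above.
#include <cstdint>
#include <vector>

void keyswitch_identity_indexes_64(cuda_stream_t *stream, void *lwe_out,
                                   void *lwe_in, void *ksk, uint32_t dim_in,
                                   uint32_t dim_out, uint32_t base_log,
                                   uint32_t level_count,
                                   uint32_t num_samples) {
  std::vector<uint64_t> h_idx(num_samples);
  for (uint32_t i = 0; i < num_samples; i++)
    h_idx[i] = i; // ciphertext i reads from / writes to slot i
  uint64_t bytes = num_samples * sizeof(uint64_t);
  uint64_t *d_idx = (uint64_t *)cuda_malloc_async(bytes, stream);
  cuda_memcpy_async_to_gpu(d_idx, h_idx.data(), bytes, stream);
  cuda_keyswitch_lwe_ciphertext_vector_64(stream, lwe_out, d_idx, lwe_in,
                                          d_idx, ksk, dim_in, dim_out,
                                          base_log, level_count, num_samples);
  // Drain the stream before h_idx goes out of scope, then free the indexes.
  cuda_synchronize_stream(stream);
  cuda_drop_async(d_idx, stream);
}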
@@ -3,23 +3,14 @@
#include <cuda_runtime.h>

/// Unsafe function to create a CUDA stream, must check first that GPU exists
cudaStream_t cuda_create_stream(uint32_t gpu_index) {
cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
  cudaStream_t stream;
  check_cuda_error(cudaStreamCreate(&stream));
  cuda_stream_t *stream = new cuda_stream_t(gpu_index);
  return stream;
}

/// Unsafe function to destroy CUDA stream, must check first the GPU exists
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaStreamDestroy(stream));
}

void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaStreamSynchronize(stream));
}
void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); }

/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
@@ -34,22 +25,20 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {

/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                        uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  void *ptr;

#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
  int support_async_alloc;
  check_cuda_error(cudaDeviceGetAttribute(
      &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
                                          cudaDevAttrMemoryPoolsSupported,
                                          stream->gpu_index));

  if (support_async_alloc) {
    cuda_synchronize_stream(stream, gpu_index);
    check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream));
    cuda_synchronize_stream(stream, gpu_index);
    check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream->stream));
  } else {
    check_cuda_error(cudaMalloc((void **)&ptr, size));
  }
@@ -82,60 +71,46 @@ bool cuda_check_support_cooperative_groups() {
  return cooperative_groups_supported > 0;
}

/// Returns
/// false if Thread Block Cluster is not supported.
/// true otherwise
bool cuda_check_support_thread_block_clusters() {
#if CUDA_ARCH >= 900
  // To-do: Is this really the best way to check support?
  int tbc_supported = 0;
  check_cuda_error(
      cudaDeviceGetAttribute(&tbc_supported, cudaDevAttrClusterLaunch, 0));

  return tbc_supported > 0;
#else
  return false;
#endif
}

/// Copy memory to the GPU asynchronously
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
                              cuda_stream_t *stream) {
  if (size == 0)
    return;
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
  }

  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
}

/// Copy memory within a GPU asynchronously
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index) {
                                  cuda_stream_t *stream) {
  if (size == 0)
    return;
  cudaPointerAttributes attr_dest;
  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
  if (attr_dest.device != gpu_index && attr_dest.type != cudaMemoryTypeDevice) {
  if (attr_dest.device != stream->gpu_index &&
      attr_dest.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
  }
  cudaPointerAttributes attr_src;
  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
  if (attr_src.device != gpu_index && attr_src.type != cudaMemoryTypeDevice) {
  if (attr_src.device != stream->gpu_index &&
      attr_src.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
  if (attr_src.device != attr_dest.device) {
    PANIC("Cuda error: different devices specified in copy from GPU to GPU.")
  }

  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
                                   stream->stream));
}

/// Synchronizes device
@@ -145,16 +120,16 @@ void cuda_synchronize_device(uint32_t gpu_index) {
}

void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                       cudaStream_t stream, uint32_t gpu_index) {
                       cuda_stream_t *stream) {
  if (size == 0)
    return;
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
  }
  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
}

template <typename Torus>
@@ -165,45 +140,42 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
}

template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                          Torus *d_array, Torus value, Torus n) {
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
                          Torus n) {
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
  if (attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in cuda set value.")
  }
  check_cuda_error(cudaSetDevice(gpu_index));
  int block_size = 256;
  int num_blocks = (n + block_size - 1) / block_size;

  // Launch the kernel
  cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
                                                               n);
  cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
                                                                n);
  check_cuda_error(cudaGetLastError());
}

/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                                   uint64_t *d_array, uint64_t value,
                                   uint64_t n);
template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                                   uint32_t *d_array, uint32_t value,
                                   uint32_t n);
template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
                                   uint64_t value, uint64_t n);
template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
                                   uint32_t value, uint32_t n);

/// Copy memory to the CPU asynchronously
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
                              cuda_stream_t *stream) {
  if (size == 0)
    return;
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, src));
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
  }

  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
}

/// Return number of GPUs available
@@ -220,18 +192,19 @@ void cuda_drop(void *ptr, uint32_t gpu_index) {
}

/// Drop a cuda array asynchronously, if supported on the device
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
void cuda_drop_async(void *ptr, cuda_stream_t *stream) {

  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaSetDevice(stream->gpu_index));
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
  int support_async_alloc;
  check_cuda_error(cudaDeviceGetAttribute(
      &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
                                          cudaDevAttrMemoryPoolsSupported,
                                          stream->gpu_index));

  if (support_async_alloc) {
    check_cuda_error(cudaFreeAsync(ptr, stream));
    check_cuda_error(cudaFreeAsync(ptr, stream->stream));
  } else {
    check_cuda_error(cudaFree(ptr));
  }
@@ -250,11 +223,13 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
  return max_shared_memory;
}

void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }

void cuda_stream_add_callback(cuda_stream_t *stream,
                              cudaStreamCallback_t callback, void *user_data) {

  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
  check_cuda_error(
      cudaStreamAddCallback(stream->stream, callback, user_data, 0));
}

void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,

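Every allocation, copy, kernel launch, and free in this diff is now routed through a cuda_stream_t handle whose definition is not part of the changes shown here. Judging from its uses (stream->stream, stream->gpu_index, release(), synchronize(), and new cuda_stream_t(gpu_index) in cuda_create_stream), a minimal sketch of the wrapper could look like the following — this is an inference, not the actual definition:

// Hypothetical sketch of the stream wrapper implied by this diff; the real
// definition lives elsewhere in the tree and may differ.
struct cuda_stream_t {
  cudaStream_t stream; // raw CUDA stream, used as stream->stream at launch sites
  uint32_t gpu_index;  // device the stream is bound to, used for cudaSetDevice

  cuda_stream_t(uint32_t gpu_index) : gpu_index(gpu_index) {
    check_cuda_error(cudaSetDevice(gpu_index));
    check_cuda_error(cudaStreamCreate(&stream));
  }

  void synchronize() {
    check_cuda_error(cudaSetDevice(gpu_index));
    check_cuda_error(cudaStreamSynchronize(stream));
  }

  void release() {
    check_cuda_error(cudaSetDevice(gpu_index));
    check_cuda_error(cudaStreamDestroy(stream));
  }
};

Bundling the device index with the stream removes a whole class of mismatched (stream, gpu_index) argument pairs, which is the main design motivation visible in the call-site changes above.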
@@ -1,7 +1,7 @@
#include "integer/bitwise_ops.cuh"

void scratch_cuda_integer_radix_bitop_kb_64(
    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -15,19 +15,17 @@ void scratch_cuda_integer_radix_bitop_kb_64(
      message_modulus, carry_modulus);

  scratch_cuda_integer_radix_bitop_kb<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count, params,
      op_type, allocate_gpu_memory);
      stream, (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count,
      params, op_type, allocate_gpu_memory);
}

void cuda_bitop_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
    void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
    uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitop_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array_out),
      stream, static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_1),
      static_cast<uint64_t *>(lwe_array_2),
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
@@ -35,22 +33,19 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
}

void cuda_bitnot_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void *bsk,
    void *ksk, uint32_t lwe_ciphertext_count) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitnot_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array_out),
      stream, static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_in),
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      lwe_ciphertext_count);
}

void cleanup_cuda_integer_bitop(void *stream, uint32_t gpu_index,
                                int8_t **mem_ptr_void) {
void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
  mem_ptr->release(stream);
}

@@ -13,8 +13,7 @@

template <typename Torus>
__host__ void
host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                            uint32_t gpu_count, Torus *lwe_array_out,
host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                            Torus *lwe_array_1, Torus *lwe_array_2,
                            int_bitop_buffer<Torus> *mem_ptr, void *bsk,
                            Torus *ksk, uint32_t num_radix_blocks) {
@@ -22,32 +21,32 @@ host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
  auto lut = mem_ptr->lut;

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
      bsk, ksk, num_radix_blocks, lut);
}

template <typename Torus>
__host__ void host_integer_radix_bitnot_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, int_bitop_buffer<Torus> *mem_ptr,
    void *bsk, Torus *ksk, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsk, ksk,
      stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
      num_radix_blocks, lut);
}

template <typename Torus>
__host__ void
host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                             Torus *lwe_array_in,
                             int_bitop_buffer<Torus> *mem_ptr, void *bsk,
                             Torus *ksk, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks, lut);
}

template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
    cudaStream_t stream, uint32_t gpu_index, int_bitop_buffer<Torus> **mem_ptr,
    cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
    bool allocate_gpu_memory) {

  cudaSetDevice(gpu_index);
  *mem_ptr = new int_bitop_buffer<Torus>(stream, gpu_index, op, params,
                                         num_radix_blocks, allocate_gpu_memory);
  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
                                         allocate_gpu_memory);
}

#endif

@@ -1,7 +1,7 @@
#include "integer/cmux.cuh"

void scratch_cuda_integer_radix_cmux_kb_64(
    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -17,20 +17,17 @@ void scratch_cuda_integer_radix_cmux_kb_64(
      [](uint64_t x) -> uint64_t { return x == 1; };

  scratch_cuda_integer_radix_cmux_kb(
      static_cast<cudaStream_t>(stream), gpu_index,
      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      stream, (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
}

void cuda_cmux_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
    void *lwe_array_false, int8_t *mem_ptr, void *bsk, void *ksk,
    uint32_t lwe_ciphertext_count) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
    void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
    void *ksk, uint32_t lwe_ciphertext_count) {

  host_integer_radix_cmux_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array_out),
      stream, static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_condition),
      static_cast<uint64_t *>(lwe_array_true),
      static_cast<uint64_t *>(lwe_array_false),
@@ -39,10 +36,10 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
      lwe_ciphertext_count);
}

void cleanup_cuda_integer_radix_cmux(void *stream, uint32_t gpu_index,
void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
                                     int8_t **mem_ptr_void) {

  int_cmux_buffer<uint64_t> *mem_ptr =
      (int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
  mem_ptr->release(stream);
}

@@ -5,13 +5,12 @@
#include <omp.h>

template <typename Torus>
__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
                          uint32_t gpu_count, Torus *lwe_array_out,
__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
                          Torus *lwe_array_input, Torus *lwe_condition,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
                          int_radix_lut<Torus> *predicate, void *bsk,
                          Torus *ksk, uint32_t num_radix_blocks) {
  cudaSetDevice(gpu_indexes[0]);
  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;

  int big_lwe_size = params.big_lwe_dimension + 1;
@@ -28,7 +27,8 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
    auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
    auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;

    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
                                   stream->stream>>>(
        lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
        lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
        params.message_modulus, 1);
@@ -36,23 +36,23 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsk,
      ksk, num_radix_blocks, predicate);
      stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
      predicate);
}

template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true,
    Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks) {
__host__ void
host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                           Torus *lwe_condition, Torus *lwe_array_true,
                           Torus *lwe_array_false,
                           int_cmux_buffer<Torus> *mem_ptr, void *bsk,
                           Torus *ksk, uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

  // Since our CPU threads will be working on different streams we shall assert
  // the work in the main stream is completed
  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
  stream->synchronize();
  auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
  auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;

@@ -62,43 +62,41 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
#pragma omp section
|
||||
{
|
||||
auto mem_true = mem_ptr->zero_if_true_buffer;
|
||||
zero_out_if(&true_stream, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
|
||||
lwe_array_true, lwe_condition, mem_true,
|
||||
mem_ptr->inverted_predicate_lut, bsk, ksk, num_radix_blocks);
|
||||
zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
|
||||
lwe_condition, mem_true, mem_ptr->inverted_predicate_lut, bsk,
|
||||
ksk, num_radix_blocks);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
auto mem_false = mem_ptr->zero_if_false_buffer;
|
||||
zero_out_if(&false_stream, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
|
||||
lwe_array_false, lwe_condition, mem_false,
|
||||
mem_ptr->predicate_lut, bsk, ksk, num_radix_blocks);
|
||||
zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
|
||||
lwe_condition, mem_false, mem_ptr->predicate_lut, bsk, ksk,
|
||||
num_radix_blocks);
|
||||
}
|
||||
}
|
||||
cuda_synchronize_stream(true_stream, gpu_indexes[0]);
|
||||
cuda_synchronize_stream(false_stream, gpu_indexes[0]);
|
||||
cuda_synchronize_stream(true_stream);
|
||||
cuda_synchronize_stream(false_stream);
|
||||
|
||||
// If the condition was true, true_ct will have kept its value and false_ct
|
||||
// will be 0 If the condition was false, true_ct will be 0 and false_ct will
|
||||
// have kept its value
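  // Concretely, for a boolean condition c: tmp_true_ct holds c * true_ct and
  // tmp_false_ct holds (1 - c) * false_ct, so their sum equals true_ct when
  // c = 1 and false_ct when c = 0; the message-extract LUT applied below then
  // clears the carries introduced by the addition.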
  auto added_cts = mem_ptr->tmp_true_ct;
  host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
                mem_ptr->tmp_false_ct, params.big_lwe_dimension,
                num_radix_blocks);
  host_addition(stream, added_cts, mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
                params.big_lwe_dimension, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsk, ksk,
      num_radix_blocks, mem_ptr->message_extract_lut);
      stream, lwe_array_out, added_cts, bsk, ksk, num_radix_blocks,
      mem_ptr->message_extract_lut);
}

template <typename Torus>
__host__ void scratch_cuda_integer_radix_cmux_kb(
    cudaStream_t stream, uint32_t gpu_index, int_cmux_buffer<Torus> **mem_ptr,
    cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

  cudaSetDevice(gpu_index);
  *mem_ptr =
      new int_cmux_buffer<Torus>(stream, gpu_index, predicate_lut_f, params,
                                 num_radix_blocks, allocate_gpu_memory);
  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
                                        num_radix_blocks, allocate_gpu_memory);
}
#endif

@@ -1,7 +1,7 @@
#include "integer/comparison.cuh"

void scratch_cuda_integer_radix_comparison_kb_64(
    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -18,9 +18,8 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  case EQ:
  case NE:
    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
        static_cast<cudaStream_t>(stream), gpu_index,
        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
        op_type, false, allocate_gpu_memory);
        stream, (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks,
        params, op_type, false, allocate_gpu_memory);
    break;
  case GT:
  case GE:
@@ -29,17 +28,16 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  case MAX:
  case MIN:
    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
        static_cast<cudaStream_t>(stream), gpu_index,
        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
        op_type, is_signed, allocate_gpu_memory);
        stream, (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks,
        params, op_type, is_signed, allocate_gpu_memory);
    break;
  }
}

void cuda_comparison_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
    void *bsk, void *ksk, uint32_t num_radix_blocks) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
    uint32_t num_radix_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -47,8 +45,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  case EQ:
  case NE:
    host_integer_radix_equality_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
        static_cast<uint64_t *>(ksk), num_radix_blocks);
@@ -58,8 +55,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  case LT:
  case LE:
    host_integer_radix_difference_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer,
        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
@@ -68,8 +64,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  case MAX:
  case MIN:
    host_integer_radix_maxmin_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
        static_cast<uint64_t *>(ksk), num_radix_blocks);
@@ -79,10 +74,10 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  }
}

void cleanup_cuda_integer_comparison(void *stream, uint32_t gpu_index,
void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
                                     int8_t **mem_ptr_void) {

  int_comparison_buffer<uint64_t> *mem_ptr =
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
  mem_ptr->release(stream);
}

@@ -33,17 +33,16 @@ __global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
}

template <typename Torus>
__host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
                                    Torus *output, Torus *input,
                                    uint32_t lwe_dimension,
__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
                                    Torus *input, uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_index);
  cudaSetDevice(stream->gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  // Add all blocks and store in sum
  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
      output, input, lwe_dimension, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
}
@@ -57,13 +56,12 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
 */
template <typename Torus>
__host__ void
are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
                               uint32_t gpu_count, Torus *lwe_array_out,
are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
                               Torus *lwe_array_in,
                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                               Torus *ksk, uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -78,10 +76,9 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

  cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
                               num_radix_blocks * (big_lwe_dimension + 1) *
                                   sizeof(Torus),
                               streams[0], gpu_indexes[0]);
  cuda_memcpy_async_gpu_to_gpu(
      tmp_out, lwe_array_in,
      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);

  uint32_t remaining_blocks = num_radix_blocks;

@@ -95,8 +92,8 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
    auto input_blocks = tmp_out;
    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
                            input_blocks, big_lwe_dimension, chunk_length);
      accumulate_all_blocks(stream, accumulator, input_blocks,
                            big_lwe_dimension, chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -118,18 +115,16 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
      lut = (*is_equal_to_num_blocks_map)[chunk_length];
    } else {
      // LUT needs to be computed
      auto new_lut =
          new int_radix_lut<Torus>(streams[0], gpu_indexes[0], params,
                                   max_value, num_radix_blocks, true);
      auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
                                              num_radix_blocks, true);

      auto is_equal_to_num_blocks_lut_f = [max_value,
                                           chunk_length](Torus x) -> Torus {
        return (x & max_value) == chunk_length;
      };
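      // Since every input block encrypts 0 or 1, the accumulated chunk sum
      // equals chunk_length exactly when all blocks in the chunk are 1;
      // masking with max_value keeps the comparison inside the
      // message-and-carry space the sum can occupy.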
      generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], new_lut->lut, glwe_dimension,
          polynomial_size, message_modulus, carry_modulus,
          is_equal_to_num_blocks_lut_f);
          stream, new_lut->lut, glwe_dimension, polynomial_size,
          message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);

      (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
      lut = new_lut;
@@ -140,13 +135,11 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk,
          1, lut);
          stream, lwe_array_out, accumulator, bsk, ksk, 1, lut);
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsk, ksk,
          num_chunks, lut);
          stream, tmp_out, accumulator, bsk, ksk, num_chunks, lut);
    }
  }
}
@@ -159,12 +152,9 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
 */
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -175,10 +165,9 @@ __host__ void is_at_least_one_comparisons_block_true(
  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

  cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
                               num_radix_blocks * (big_lwe_dimension + 1) *
                                   sizeof(Torus),
                               streams[0], gpu_indexes[0]);
  cuda_memcpy_async_gpu_to_gpu(
      mem_ptr->tmp_lwe_array_out, lwe_array_in,
      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);

  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 0) {
@@ -191,8 +180,8 @@ __host__ void is_at_least_one_comparisons_block_true(
    auto input_blocks = mem_ptr->tmp_lwe_array_out;
    auto accumulator = buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
                            input_blocks, big_lwe_dimension, chunk_length);
      accumulate_all_blocks(stream, accumulator, input_blocks,
                            big_lwe_dimension, chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -207,13 +196,12 @@ __host__ void is_at_least_one_comparisons_block_true(
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk,
          1, lut);
          stream, lwe_array_out, accumulator, bsk, ksk, 1, lut);
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
          accumulator, bsk, ksk, num_chunks, lut);
          stream, mem_ptr->tmp_lwe_array_out, accumulator, bsk, ksk, num_chunks,
          lut);
    }
  }
}
@@ -239,12 +227,11 @@ __host__ void is_at_least_one_comparisons_block_true(
// are_all_comparisons_block_true
template <typename Torus>
__host__ void host_compare_with_zero_equality(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {

  cudaSetDevice(gpu_indexes[0]);
  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -269,8 +256,7 @@ __host__ void host_compare_with_zero_equality(

  if (num_radix_blocks == 1) {
    // Just copy
    cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes,
                                 streams[0], gpu_indexes[0]);
    cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
    num_sum_blocks = 1;
  } else {
    uint32_t remainder_blocks = num_radix_blocks;
@@ -280,8 +266,8 @@ __host__ void host_compare_with_zero_equality(
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

      accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
                            big_lwe_dimension, chunk_size);
      accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
                            chunk_size);

      num_sum_blocks++;
      remainder_blocks -= (chunk_size - 1);
@@ -293,46 +279,41 @@ __host__ void host_compare_with_zero_equality(
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, sum, sum, bsk, ksk, num_sum_blocks,
      zero_comparison);
  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
                                 sum, mem_ptr, bsk, ksk, num_sum_blocks);
      stream, sum, sum, bsk, ksk, num_sum_blocks, zero_comparison);
  are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
                                 num_sum_blocks);
}

template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2,
    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    uint32_t num_radix_blocks) {
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
    Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  cudaSetDevice(stream->gpu_index);
  auto eq_buffer = mem_ptr->eq_buffer;

  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
  integer_radix_apply_bivariate_lookup_table_kb(
      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
      bsk, ksk, num_radix_blocks, eq_buffer->operator_lut);
      stream, comparisons, lwe_array_1, lwe_array_2, bsk, ksk, num_radix_blocks,
      eq_buffer->operator_lut);

  // This takes a Vec of blocks, where each block is either 0 or 1.
  //
  // It returns a block encrypting 1 if all input blocks are 1,
  // otherwise the block encrypts 0.
  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
                                 comparisons, mem_ptr, bsk, ksk,
                                 num_radix_blocks);
  are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
                                 bsk, ksk, num_radix_blocks);
}

template <typename Torus>
__host__ void
compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *lwe_array_out,
compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                        Torus *lwe_array_left, Torus *lwe_array_right,
                        int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                        Torus *ksk, uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -353,21 +334,21 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

  // Subtract
  // Here we need the true lwe sub, not the one that comes from shortint.
  host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
                   lwe_array_right, big_lwe_dimension, num_radix_blocks);
  host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
                   big_lwe_dimension, num_radix_blocks);

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsk, ksk,
      num_radix_blocks, is_non_zero_lut);
      stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
      is_non_zero_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1. So the output values after the addition will be: 0, 1, 2.
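  // In other words, each block now encodes a three-valued sign: 0 for
  // inferior, 1 for equal, 2 for superior, matching the (inferior, equal,
  // superior) convention that tree_sign_reduction later collapses into a
  // single block.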
  host_integer_radix_add_scalar_one_inplace(
      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
      num_radix_blocks, message_modulus, carry_modulus);
  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
                                            big_lwe_dimension, num_radix_blocks,
                                            message_modulus, carry_modulus);
}

// Reduces a vec containing shortint blocks that encrypt a sign
@@ -375,14 +356,13 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// final sign
template <typename Torus>
__host__ void
tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
                    uint32_t gpu_count, Torus *lwe_array_out,
tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
                    Torus *lwe_block_comparisons,
                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
                    std::function<Torus(Torus)> sign_handler_f, void *bsk,
                    Torus *ksk, uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  cudaSetDevice(stream->gpu_index);
  auto params = tree_buffer->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -401,19 +381,16 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
  auto y = tree_buffer->tmp_y;
  if (x != lwe_block_comparisons)
    cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
                                 big_lwe_size_bytes * num_radix_blocks,
                                 streams[0], gpu_indexes[0]);
                                 big_lwe_size_bytes * num_radix_blocks, stream);

  uint32_t partial_block_count = num_radix_blocks;

  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
                partial_block_count, 4);
    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, x, y, bsk, ksk,
        partial_block_count >> 1, inner_tree_leaf);
        stream, x, y, bsk, ksk, partial_block_count >> 1, inner_tree_leaf);

    if ((partial_block_count % 2) != 0) {
      partial_block_count >>= 1;
@@ -423,8 +400,7 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
      auto last_x_block = x + (partial_block_count - 1) * big_lwe_size;

      cuda_memcpy_async_gpu_to_gpu(last_x_block, last_y_block,
                                   big_lwe_size_bytes, streams[0],
                                   gpu_indexes[0]);
                                   big_lwe_size_bytes, stream);
    } else {
      partial_block_count >>= 1;
    }
@@ -435,8 +411,7 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
  std::function<Torus(Torus)> f;

  if (partial_block_count == 2) {
    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
                partial_block_count, 4);
    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);

    f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
      int msb = (x >> 2) & 3;
@@ -450,24 +425,23 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
    y = x;
    f = sign_handler_f;
  }
  generate_device_accumulator<Torus>(streams[0], gpu_indexes[0], last_lut->lut,
                                     glwe_dimension, polynomial_size,
                                     message_modulus, carry_modulus, f);
  generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
                                     polynomial_size, message_modulus,
                                     carry_modulus, f);

  // Last leaf
  integer_radix_apply_univariate_lookup_table_kb(
      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsk, ksk, 1, last_lut);
  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
                                                 ksk, 1, last_lut);
}

template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right,
    int_comparison_buffer<Torus> *mem_ptr,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
    Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
    uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  cudaSetDevice(stream->gpu_index);
  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
@@ -489,21 +463,21 @@ __host__ void host_integer_radix_difference_check_kb(
    if (mem_ptr->is_signed) {
      packed_num_radix_blocks -= 2;
    }
    pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
    pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
    pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
                packed_num_radix_blocks, message_modulus);
    pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
                packed_num_radix_blocks, message_modulus);
    // From this point we have half number of blocks
    packed_num_radix_blocks /= 2;

    // Clean noise
    auto identity_lut = mem_ptr->identity_lut;
    integer_radix_apply_univariate_lookup_table_kb(
        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsk, ksk,
        packed_num_radix_blocks, identity_lut);
        stream, packed_left, packed_left, bsk, ksk, packed_num_radix_blocks,
        identity_lut);
    integer_radix_apply_univariate_lookup_table_kb(
        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsk, ksk,
        packed_num_radix_blocks, identity_lut);
        stream, packed_right, packed_right, bsk, ksk, packed_num_radix_blocks,
        identity_lut);

    lhs = packed_left;
    rhs = packed_right;
@@ -518,15 +492,15 @@ __host__ void host_integer_radix_difference_check_kb(
  if (!mem_ptr->is_signed) {
    // Compare packed blocks, or simply the total number of radix blocks in the
    // inputs
    compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
                            rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks);
    compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
                            packed_num_radix_blocks);
    num_comparisons = packed_num_radix_blocks;
  } else {
    // Packing is possible
    if (carry_modulus >= message_modulus) {
      // Compare (num_radix_blocks - 2) / 2 packed blocks
      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
                              rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks);
      compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
                              packed_num_radix_blocks);

      // Compare the last block before the sign block separately
      auto identity_lut = mem_ptr->identity_lut;
@@ -536,35 +510,32 @@ __host__ void host_integer_radix_difference_check_kb(
          diff_buffer->tmp_packed_right +
          packed_num_radix_blocks * big_lwe_size;
      integer_radix_apply_univariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
          stream, last_left_block_before_sign_block,
          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
          identity_lut);
      integer_radix_apply_univariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
          stream, last_right_block_before_sign_block,
          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
          identity_lut);
      compare_radix_blocks_kb(
          streams, gpu_indexes, gpu_count,
          comparisons + packed_num_radix_blocks * big_lwe_size,
          stream, comparisons + packed_num_radix_blocks * big_lwe_size,
          last_left_block_before_sign_block, last_right_block_before_sign_block,
          mem_ptr, bsk, ksk, 1);
      // Compare the sign block separately
      integer_radix_apply_bivariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count,
          comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
          stream, comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
          mem_ptr->signed_lut);
      num_comparisons = packed_num_radix_blocks + 2;

    } else {
      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
                              lwe_array_left, lwe_array_right, mem_ptr, bsk,
                              ksk, num_radix_blocks - 1);
      compare_radix_blocks_kb(stream, comparisons, lwe_array_left,
                              lwe_array_right, mem_ptr, bsk, ksk,
                              num_radix_blocks - 1);
      // Compare the sign block separately
      integer_radix_apply_bivariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count,
          comparisons + (num_radix_blocks - 1) * big_lwe_size,
          stream, comparisons + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
          mem_ptr->signed_lut);
@@ -575,44 +546,39 @@ __host__ void host_integer_radix_difference_check_kb(
  // Reduces a vec containing radix blocks that encrypt a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
  tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
                      comparisons, mem_ptr->diff_buffer->tree_buffer,
                      reduction_lut_f, bsk, ksk, num_comparisons);
  tree_sign_reduction(stream, lwe_array_out, comparisons,
                      mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
                      ksk, num_comparisons);
}

template <typename Torus>
__host__ void scratch_cuda_integer_radix_comparison_check_kb(
    cudaStream_t stream, uint32_t gpu_index,
    int_comparison_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, COMPARISON_TYPE op, bool is_signed,
    bool allocate_gpu_memory) {
    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
    bool is_signed, bool allocate_gpu_memory) {

  cudaSetDevice(gpu_index);
  *mem_ptr = new int_comparison_buffer<Torus>(stream, gpu_index, op, params,
                                              num_radix_blocks, is_signed,
                                              allocate_gpu_memory);
  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_comparison_buffer<Torus>(
      stream, op, params, num_radix_blocks, is_signed, allocate_gpu_memory);
}

template <typename Torus>
__host__ void
host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                             uint32_t gpu_count, Torus *lwe_array_out,
host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                             Torus *lwe_array_left, Torus *lwe_array_right,
                             int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                             Torus *ksk, uint32_t total_num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  cudaSetDevice(stream->gpu_index);
  // Compute the sign
  host_integer_radix_difference_check_kb(
      streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsk,
      ksk, total_num_radix_blocks);
      stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
      mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks);

  // Selector
  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
                             lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
                             total_num_radix_blocks);
  host_integer_radix_cmux_kb(
      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}

#endif

@@ -1,141 +0,0 @@
#include "integer/div_rem.cuh"

void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

#ifdef BENCH_SCRATCH_LEVEL_1
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // Record start time
  cudaEventRecord(start, static_cast<cudaStream_t>(stream));
#endif

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  scratch_cuda_integer_div_rem_kb<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      (int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);

#ifdef BENCH_SCRATCH_LEVEL_1
  cudaEventRecord(stop, static_cast<cudaStream_t>(stream));
  cudaEventSynchronize(stop);

  float milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  printf("Time for scratch operations: %.3f ms\n", milliseconds);
#endif

}

void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
    void *remainder, void *numerator, void *divisor, int8_t *mem_ptr, void *bsk,
    void *ksk, uint32_t num_blocks) {

  auto stream_array = (cudaStream_t *)(streams);
  auto cur_stream = stream_array[0];
#ifdef BENCH_HOST_LEVEL_1
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // Record start time
  cudaEventRecord(start, cur_stream);
#endif
  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

  switch (mem->params.polynomial_size) {
  case 512:
    host_integer_div_rem_kb<uint64_t, Degree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
    break;
  case 1024:

    host_integer_div_rem_kb<uint64_t, Degree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
    break;
  case 2048:
    host_integer_div_rem_kb<uint64_t, Degree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
    break;
  case 4096:
    host_integer_div_rem_kb<uint64_t, Degree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
    break;
  case 8192:
    host_integer_div_rem_kb<uint64_t, Degree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
    break;
  case 16384:
    host_integer_div_rem_kb<uint64_t, Degree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
    break;
  default:
    PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
          "Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported")
  }

#ifdef BENCH_HOST_LEVEL_1
  cudaEventRecord(stop, cur_stream);
  cudaEventSynchronize(stop);

  float milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  printf("Time for host operations: %.3f ms\n", milliseconds);
#endif

}

void cleanup_cuda_integer_div_rem(void *stream, uint32_t gpu_index,
                                  int8_t **mem_ptr_void) {
#ifdef BENCH_DROP_LEVEL_1
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // Record start time
  cudaEventRecord(start, static_cast<cudaStream_t>(stream));
#endif
  int_div_rem_memory<uint64_t> *mem_ptr =
      (int_div_rem_memory<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);

#ifdef BENCH_DROP_LEVEL_1
  cudaEventRecord(stop, static_cast<cudaStream_t>(stream));
  cudaEventSynchronize(stop);

  float milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  printf("Time for drop operations: %.3f ms\n", milliseconds);
#endif
}
@@ -1,965 +0,0 @@
#ifndef TFHE_RS_DIV_REM_CUH
#define TFHE_RS_DIV_REM_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>

#define BENCH_SCRATCH_LEVEL_1
#define BENCH_HOST_LEVEL_1
#define BENCH_DROP_LEVEL_1

#define BENCH_HOST_PHASE_0_LEVEL_2
#define BENCH_HOST_PHASE_1_LEVEL_3
#define BENCH_HOST_PHASE_2_LEVEL_3
#define BENCH_HOST_PHASE_3_LEVEL_3
#define BENCH_HOST_PHASE_4_LEVEL_2
#define BENCH_LEVEL_4
#define BENCH_OVERFLOW_SUM_LEVEL_2
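
// These switches gate cudaEvent-based timing probes at increasing
// granularity: the LEVEL_1 defines time whole scratch/host/drop calls, the
// PHASE_x LEVEL_2/LEVEL_3 defines time individual phases of the division
// loop, and BENCH_LEVEL_4 times the per-iteration lambdas, accumulating into
// the total_phase_* counters below.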

float total_phase_1 = 0;
float total_phase_2 = 0;
float total_phase_3 = 0;
float total_phase_overflow = 0;

float total_phase_1_1 = 0;
float total_phase_1_2 = 0;
float total_phase_1_3 = 0;
float total_phase_1_4 = 0;

float total_phase_2_1 = 0;
float total_phase_2_2 = 0;
float total_phase_2_3 = 0;
float total_phase_2_4 = 0;

float total_phase_3_1 = 0;
float total_phase_3_2 = 0;
float total_phase_3_3 = 0;
float total_phase_3_4 = 0;


int ceil_div(int a, int b) { return (a + b - 1) / b; }

// struct makes it easier to use a list of ciphertexts and move data between
// them. The struct does not allocate or drop any memory; it keeps track of
// the number of ciphertexts inside the list.
template <typename Torus> struct lwe_ciphertext_list {
  Torus *data;
  size_t max_blocks;
  size_t len;
  int_radix_params params;

  size_t big_lwe_size;
  size_t radix_size;
  size_t big_lwe_size_bytes;
  size_t radix_size_bytes;
  size_t big_lwe_dimension;

  lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
      : data(src), params(params), max_blocks(max_blocks) {
    big_lwe_size = params.big_lwe_dimension + 1;
    big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
    radix_size = max_blocks * big_lwe_size;
    radix_size_bytes = radix_size * sizeof(Torus);
    big_lwe_dimension = params.big_lwe_dimension;
    len = max_blocks;
  }

  // copies ciphertexts from Torus*, starting from `start_block` up to and
  // including `finish_block`; does not change the value of self len
  void copy_from(Torus *src, size_t start_block, size_t finish_block,
                 cudaStream_t stream, uint32_t gpu_index) {
    size_t tmp_len = finish_block - start_block + 1;
    cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
                                 tmp_len * big_lwe_size_bytes, stream,
                                 gpu_index);
  }

  // copies ciphertexts from lwe_ciphertext_list, starting from `start_block`
  // up to and including `finish_block`; does not change the value of self len
  void copy_from(const lwe_ciphertext_list &src, size_t start_block,
                 size_t finish_block, cudaStream_t stream, uint32_t gpu_index) {
    copy_from(src.data, start_block, finish_block, stream, gpu_index);
  }

  // copies ciphertexts from Torus*, starting from `start_block`
  // up to and including `finish_block`, updating the value of self len
  void clone_from(Torus *src, size_t start_block, size_t finish_block,
                  cudaStream_t stream, uint32_t gpu_index) {
    len = finish_block - start_block + 1;

    cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
                                 len * big_lwe_size_bytes, stream, gpu_index);
  }

  // copies ciphertexts from another ciphertext list, starting from
  // `start_block` up to and including `finish_block`, updating the value of
  // self len
  void clone_from(const lwe_ciphertext_list &src, size_t start_block,
                  size_t finish_block, cudaStream_t stream,
                  uint32_t gpu_index) {
    clone_from(src.data, start_block, finish_block, stream, gpu_index);
  }

  // assign zero to blocks starting from `start_block` including `finish_block`
  void assign_zero(size_t start_block, size_t finish_block, cudaStream_t stream,
                   uint32_t gpu_index) {
    auto size = finish_block - start_block + 1;
    cuda_memset_async(&data[start_block * big_lwe_size], 0,
                      size * big_lwe_size_bytes, stream, gpu_index);
  }

  // return pointer to last block
  Torus *last_block() { return &data[(len - 1) * big_lwe_size]; }

  // return pointer to first block
  Torus *first_block() { return data; }

  // return block with `index`
  Torus *get_block(size_t index) {
    assert(index < len);
    return &data[index * big_lwe_size];
  }

  bool is_empty() { return len == 0; }

  // does not drop actual memory from `data`, only reduces value of `len` by one
  void pop() {
    if (len > 0)
      len--;
    else
      assert(len > 0);
  }

  // insert ciphertext at index `ind`
  void insert(size_t ind, Torus *ciphertext_block, cudaStream_t stream,
              uint32_t gpu_index) {
    assert(ind <= len);
    assert(len < max_blocks);

    size_t insert_offset = ind * big_lwe_size;

    for (size_t i = len; i > ind; i--) {
      Torus *src = &data[(i - 1) * big_lwe_size];
      Torus *dst = &data[i * big_lwe_size];
      cuda_memcpy_async_gpu_to_gpu(dst, src, big_lwe_size_bytes, stream,
                                   gpu_index);
    }

    cuda_memcpy_async_gpu_to_gpu(&data[insert_offset], ciphertext_block,
                                 big_lwe_size_bytes, stream, gpu_index);
    len++;
  }

  // push ciphertext at the end of `data`
  void push(Torus *ciphertext_block, cudaStream_t stream, uint32_t gpu_index) {
    assert(len < max_blocks);

    size_t offset = len * big_lwe_size;
    cuda_memcpy_async_gpu_to_gpu(&data[offset], ciphertext_block,
                                 big_lwe_size_bytes, stream, gpu_index);
    len++;
  }

  // duplicate ciphertext into `number_of_blocks` ciphertexts
  void fill_with_same_ciphertext(Torus *ciphertext, size_t number_of_blocks,
                                 cudaStream_t stream, uint32_t gpu_index) {
    assert(number_of_blocks <= max_blocks);

    for (size_t i = 0; i < number_of_blocks; i++) {
      Torus *dest = &data[i * big_lwe_size];
      cuda_memcpy_async_gpu_to_gpu(dest, ciphertext, big_lwe_size_bytes, stream,
                                   gpu_index);
    }

    len = number_of_blocks;
  }

  // used for debugging, prints body of each ciphertext.
  void print_blocks_body(const char *name) {
    for (int i = 0; i < len; i++) {
      print_debug(name, &data[i * big_lwe_size + big_lwe_dimension], 1);
    }
  }
};
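
// Usage sketch: the list is a non-owning view over an existing device buffer,
// so the caller keeps ownership of the allocation and the stream (names and
// sizes below are hypothetical):
//
//   lwe_ciphertext_list<uint64_t> list(d_buf, params, /*max_blocks=*/8);
//   list.assign_zero(0, 7, stream, gpu_index);       // zero blocks 0..=7
//   list.clone_from(d_src, 2, 5, stream, gpu_index); // copy 4 blocks, len = 4
//   uint64_t *msb = list.last_block();               // block at index len - 1
//   list.pop();                                      // len = 3, data untouched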

template <typename Torus>
__host__ void
scratch_cuda_integer_div_rem_kb(cudaStream_t stream, uint32_t gpu_index,
                                int_div_rem_memory<Torus> **mem_ptr,
                                uint32_t num_blocks, int_radix_params params,
                                bool allocate_gpu_memory) {
  *mem_ptr = new int_div_rem_memory<Torus>(stream, gpu_index, params,
                                           num_blocks, allocate_gpu_memory);
}

template <typename Torus, class params>
__host__ void
host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *quotient, Torus *remainder,
                        Torus *numerator, Torus *divisor, void *bsk,
                        uint64_t *ksk, int_div_rem_memory<uint64_t> *mem_ptr,
                        uint32_t num_blocks) {

  total_phase_1 = 0;
  total_phase_2 = 0;
  total_phase_3 = 0;
  total_phase_overflow = 0;

  total_phase_1_1 = 0;
  total_phase_1_2 = 0;
  total_phase_1_3 = 0;
  total_phase_1_4 = 0;

  total_phase_2_1 = 0;
  total_phase_2_2 = 0;
  total_phase_2_3 = 0;

  total_phase_3_1 = 0;
  total_phase_3_2 = 0;
  total_phase_3_3 = 0;
  total_phase_3_4 = 0;


  cudaEvent_t start, stop;
  float milliseconds = 0;
#ifdef BENCH_HOST_PHASE_0_LEVEL_2
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // Record start time
  cudaEventRecord(start, streams[0]);
#endif
  auto radix_params = mem_ptr->params;

  auto big_lwe_dimension = radix_params.big_lwe_dimension;
  auto big_lwe_size = big_lwe_dimension + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  uint32_t message_modulus = radix_params.message_modulus;
  uint32_t carry_modulus = radix_params.carry_modulus;
  uint32_t num_bits_in_message = 31 - __builtin_clz(message_modulus);
  uint32_t total_bits = num_bits_in_message * num_blocks;
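  // For example (hypothetical parameters): message_modulus = 4 gives
  // num_bits_in_message = 31 - __builtin_clz(4) = 2, so with num_blocks = 8
  // the long-division loop below walks total_bits = 16 bit positions, most
  // significant bit first.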

  // put temporary buffers in lwe_ciphertext_list for easy use
  lwe_ciphertext_list<Torus> remainder1(mem_ptr->remainder1, radix_params,
                                        num_blocks);
  lwe_ciphertext_list<Torus> remainder2(mem_ptr->remainder2, radix_params,
                                        num_blocks);
  lwe_ciphertext_list<Torus> numerator_block_stack(
      mem_ptr->numerator_block_stack, radix_params, num_blocks);
  lwe_ciphertext_list<Torus> numerator_block_1(mem_ptr->numerator_block_1,
                                               radix_params, 1);
  lwe_ciphertext_list<Torus> tmp_radix(mem_ptr->tmp_radix, radix_params,
                                       num_blocks + 1);
  lwe_ciphertext_list<Torus> interesting_remainder1(
      mem_ptr->interesting_remainder1, radix_params, num_blocks + 1);
  lwe_ciphertext_list<Torus> interesting_remainder2(
      mem_ptr->interesting_remainder2, radix_params, num_blocks);
  lwe_ciphertext_list<Torus> interesting_divisor(mem_ptr->interesting_divisor,
                                                 radix_params, num_blocks);
  lwe_ciphertext_list<Torus> divisor_ms_blocks(mem_ptr->divisor_ms_blocks,
                                               radix_params, num_blocks);
  lwe_ciphertext_list<Torus> new_remainder(mem_ptr->new_remainder, radix_params,
                                           num_blocks);
  lwe_ciphertext_list<Torus> subtraction_overflowed(
      mem_ptr->subtraction_overflowed, radix_params, 1);
  lwe_ciphertext_list<Torus> did_not_overflow(mem_ptr->did_not_overflow,
                                              radix_params, 1);
  lwe_ciphertext_list<Torus> overflow_sum(mem_ptr->overflow_sum, radix_params,
                                          1);
  lwe_ciphertext_list<Torus> overflow_sum_radix(mem_ptr->overflow_sum_radix,
                                                radix_params, num_blocks);
  lwe_ciphertext_list<Torus> tmp_1(mem_ptr->tmp_1, radix_params, num_blocks);
  lwe_ciphertext_list<Torus> at_least_one_upper_block_is_non_zero(
      mem_ptr->at_least_one_upper_block_is_non_zero, radix_params, 1);
  lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
      mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);

  numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, streams[0],
                                   gpu_indexes[0]);
  remainder1.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
  remainder2.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);

  cuda_memset_async(quotient, 0, big_lwe_size_bytes * num_blocks, streams[0],
                    gpu_indexes[0]);
#ifdef BENCH_HOST_PHASE_0_LEVEL_2
  cudaEventRecord(stop, streams[0]);
  cudaEventSynchronize(stop);

  milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  printf("--Time for phase_0 operations: %.3f ms\n", milliseconds);
#endif

  for (int i = total_bits - 1; i >= 0; i--) {
#ifdef BENCH_HOST_PHASE_1_LEVEL_3
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record start time
    cudaEventRecord(start, streams[0]);
#endif
    uint32_t block_of_bit = i / num_bits_in_message;
    uint32_t pos_in_block = i % num_bits_in_message;
    uint32_t msb_bit_set = total_bits - 1 - i;
    uint32_t last_non_trivial_block = msb_bit_set / num_bits_in_message;

    // Index of the first block of the remainder that is fully trivial 0
    // and all blocks after it are also trivial zeros.
    // This number is in range 1..=num_blocks - 1
    uint32_t first_trivial_block = last_non_trivial_block + 1;
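    // e.g. on the first iteration (i = total_bits - 1) we have
    // msb_bit_set = 0, so only block 0 of the remainder is non-trivial and
    // first_trivial_block is 1; the non-trivial region then grows by one bit
    // per iteration.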

    interesting_remainder1.clone_from(remainder1, 0, last_non_trivial_block,
                                      streams[0], gpu_indexes[0]);
    interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
                                      streams[0], gpu_indexes[0]);
    interesting_divisor.clone_from(divisor, 0, last_non_trivial_block,
                                   streams[0], gpu_indexes[0]);
    divisor_ms_blocks.clone_from(divisor,
                                 (msb_bit_set + 1) / num_bits_in_message,
                                 num_blocks - 1, streams[0], gpu_indexes[0]);

    // We split the divisor at a block position when in reality the split
    // should be at a bit position, meaning that potentially (depending on
    // msb_bit_set) the split versions share some bits they should not. So we
    // do one PBS on the last block of the interesting_divisor, and one on the
    // first block of divisor_ms_blocks, to trim out the bits which should not
    // be there.
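    // Worked example for the trims below, with 2-bit messages: msb_bit_set = 2
    // makes the split block shared, pos_in_block = 0, so the first trim keeps
    // only the low bit of interesting_divisor's last block
    // (shifted_mask = (message_modulus - 1) >> 1 = 1) and the second trim
    // discards that bit from divisor_ms_blocks' first block
    // (shifted_mask = ((message_modulus - 1) << 1) & (message_modulus - 1) = 2).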
|
||||
auto trim_last_interesting_divisor_bits =
|
||||
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count)
|
||||
{
|
||||
#ifdef BENCH_LEVEL_4
|
||||
cudaEvent_t start, stop;
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
// Record start time
|
||||
cudaEventRecord(start, streams[0]);
|
||||
#endif
|
||||
if ((msb_bit_set + 1) % num_bits_in_message == 0) {
|
||||
return;
|
||||
}
|
||||
// The last block of the interesting part of the remainder
|
||||
// can contain bits which we should not account for
|
||||
// we have to zero them out.
|
||||
|
||||
// Where the msb is set in the block
|
||||
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
|
||||
|
||||
// e.g 2 bits in message:
|
||||
// if pos_in_block is 0, then we want to keep only first bit (right
|
||||
// shift
|
||||
// mask by 1) if pos_in_block is 1, then we want to keep the two
|
||||
// bits
|
||||
// (right shift mask by 0)
|
||||
uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
|
||||
|
||||
// Create mask of 1s on the message part, 0s in the carries
|
||||
uint32_t full_message_mask = message_modulus - 1;
|
||||
|
||||
// Shift the mask so that we will only keep bits we should
|
||||
uint32_t shifted_mask = full_message_mask >> shift_amount;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
|
||||
interesting_divisor.last_block(), bsk, ksk, 1,
|
||||
mem_ptr->masking_luts_1[shifted_mask]);
|
||||
#ifdef BENCH_LEVEL_4
|
||||
cudaEventRecord(stop, streams[0]);
|
||||
cudaEventSynchronize(stop);
|
||||
|
||||
float milliseconds = 0;
|
||||
cudaEventElapsedTime(&milliseconds, start, stop);
|
||||
total_phase_1_1 += milliseconds;
|
||||
//printf("Time for scratch operations: %.3f ms\n", milliseconds);
|
||||
#endif
|
||||
|
||||
}; // trim_last_interesting_divisor_bits

    auto trim_first_divisor_ms_bits =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
          cudaEvent_t start, stop;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          // Record start time
          cudaEventRecord(start, streams[0]);
#endif
          if (divisor_ms_blocks.is_empty() ||
              ((msb_bit_set + 1) % num_bits_in_message) == 0) {
            return;
          }
          // Where the msb is set in the block
          uint32_t pos_in_block = msb_bit_set % num_bits_in_message;

          // e.g. with 2 bits in message:
          // if pos_in_block is 0, then we want to discard the first bit
          // (left shift the mask by 1); if pos_in_block is 1, then we want to
          // discard both bits (left shift the mask by 2), hence
          // shift_amount = pos_in_block + 1
          uint32_t shift_amount = pos_in_block + 1;
          uint32_t full_message_mask = message_modulus - 1;
          uint32_t shifted_mask = full_message_mask << shift_amount;

          // Keep the mask within the range of message bits, so that
          // the estimated degree of the output is < msg_modulus
          shifted_mask = shifted_mask & full_message_mask;

          integer_radix_apply_univariate_lookup_table_kb(
              streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
              divisor_ms_blocks.first_block(), bsk, ksk, 1,
              mem_ptr->masking_luts_2[shifted_mask]);
#ifdef BENCH_LEVEL_4
          cudaEventRecord(stop, streams[0]);
          cudaEventSynchronize(stop);

          float milliseconds = 0;
          cudaEventElapsedTime(&milliseconds, start, stop);
          total_phase_1_2 += milliseconds;
#endif
        }; // trim_first_divisor_ms_bits
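    // Complementary mask sketch, again assuming 2-bit messages:
    // pos_in_block == 0 gives shift_amount = 1 and
    // shifted_mask = (0b11 << 1) & 0b11 = 0b10, so masking_luts_2[2] keeps
    // only the most significant bit: exactly the bit that
    // trim_last_interesting_divisor_bits removed on the other side of the
    // split.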

    // This does
    //  R := R << 1; R(0) := N(i)
    //
    // We could do that by left shifting R by one, then unchecked_add-ing the
    // correct numerator bit.
    //
    // However, to keep the remainder clean (noise wise), what we do instead
    // is put the remainder block from which we need to extract the bit as
    // the LSB of the remainder, so that left shifting will pull in the bit
    // we need.
    auto left_shift_interesting_remainder1 =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
          cudaEvent_t start, stop;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          // Record start time
          cudaEventRecord(start, streams[0]);
#endif
          numerator_block_1.clone_from(
              numerator_block_stack, numerator_block_stack.len - 1,
              numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
          numerator_block_stack.pop();
          interesting_remainder1.insert(0, numerator_block_1.first_block(),
                                        streams[0], gpu_indexes[0]);

          host_integer_radix_logical_scalar_shift_kb_inplace(
              streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
              mem_ptr->shift_mem_1, bsk, ksk, interesting_remainder1.len);

          tmp_radix.clone_from(interesting_remainder1, 0,
                               interesting_remainder1.len - 1, streams[0],
                               gpu_indexes[0]);

          radix_blocks_rotate_left<<<interesting_remainder1.len, 256, 0,
                                     streams[0]>>>(
              interesting_remainder1.data, tmp_radix.data, 1,
              interesting_remainder1.len, big_lwe_size);

          numerator_block_1.clone_from(
              interesting_remainder1, interesting_remainder1.len - 1,
              interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);

          interesting_remainder1.pop();

          if (pos_in_block != 0) {
            // We have not yet extracted all the bits from this numerator
            // block, so we put it back on the front so that it gets taken
            // next iteration
            numerator_block_stack.push(numerator_block_1.first_block(),
                                       streams[0], gpu_indexes[0]);
          }
#ifdef BENCH_LEVEL_4
          cudaEventRecord(stop, streams[0]);
          cudaEventSynchronize(stop);

          float milliseconds = 0;
          cudaEventElapsedTime(&milliseconds, start, stop);
          total_phase_1_3 += milliseconds;
#endif
        }; // left_shift_interesting_remainder1
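    // Plaintext analogue of the insert/shift/rotate/pop sequence above: with
    // blocks ordered LSB-first and b the numerator block holding N(i),
    // shifting [b, R0, R1, ...] left by one bit moves the top bit of b into
    // R0; rotating left by one block and popping the last block restores the
    // layout [R0', R1', ...], while the partially consumed b is pushed back
    // when pos_in_block != 0 so its remaining bits feed later iterations.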

    auto left_shift_interesting_remainder2 =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
          cudaEvent_t start, stop;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          // Record start time
          cudaEventRecord(start, streams[0]);
#endif
          host_integer_radix_logical_scalar_shift_kb_inplace(
              streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
              mem_ptr->shift_mem_2, bsk, ksk, interesting_remainder2.len);
#ifdef BENCH_LEVEL_4
          cudaEventRecord(stop, streams[0]);
          cudaEventSynchronize(stop);

          float milliseconds = 0;
          cudaEventElapsedTime(&milliseconds, start, stop);
          total_phase_1_4 += milliseconds;
#endif
        }; // left_shift_interesting_remainder2

    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
#pragma omp parallel sections
    {
#pragma omp section
      {
        // interesting_divisor
        trim_last_interesting_divisor_bits(&mem_ptr->sub_stream_1,
                                           &gpu_indexes[0], 1);
      }
#pragma omp section
      {
        // divisor_ms_blocks
        trim_first_divisor_ms_bits(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1);
      }
#pragma omp section
      {
        // interesting_remainder1
        // numerator_block_stack
        left_shift_interesting_remainder1(&mem_ptr->sub_stream_3,
                                          &gpu_indexes[0], 1);
      }
#pragma omp section
      {
        // interesting_remainder2
        left_shift_interesting_remainder2(&mem_ptr->sub_stream_4,
                                          &gpu_indexes[0], 1);
      }
    }
    cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
    cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
    cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
    cuda_synchronize_stream(mem_ptr->sub_stream_4, gpu_indexes[0]);
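    // Fork/join pattern: the main stream is synchronized once before the
    // OpenMP sections so each lambda can safely run on its own sub-stream;
    // the four cuda_synchronize_stream calls above are the join point before
    // results are recombined on streams[0].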

    // if interesting_remainder1 != 0 -> interesting_remainder2 == 0
    // if interesting_remainder1 == 0 -> interesting_remainder2 != 0
    // In practice interesting_remainder1 contains the numerator bit,
    // but in that position, interesting_remainder2 always has a 0
    auto &merged_interesting_remainder = interesting_remainder1;

    host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
                  merged_interesting_remainder.data,
                  interesting_remainder2.data, radix_params.big_lwe_dimension,
                  merged_interesting_remainder.len);
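    // Because of the invariant above, at most one of the two operands is
    // non-zero in any given block, so this addition acts as a selector:
    // block-wise, x + 0 = x and 0 + y = y, and no carry can be produced.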

    // after create_clean_version_of_merged_remainder
    // `merged_interesting_remainder` will be reused as
    // `cleaned_merged_interesting_remainder`
    cleaned_merged_interesting_remainder.clone_from(
        merged_interesting_remainder, 0, merged_interesting_remainder.len - 1,
        streams[0], gpu_indexes[0]);

    assert(merged_interesting_remainder.len == interesting_divisor.len);

    // `new_remainder` is not initialized yet, so we need to set its length
    new_remainder.len = merged_interesting_remainder.len;

#ifdef BENCH_HOST_PHASE_1_LEVEL_3
    cudaEventRecord(stop, streams[0]);
    cudaEventSynchronize(stop);

    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    total_phase_1 += milliseconds;
    // printf("----Time for phase_1 operations: %.3f ms\n", milliseconds);
#endif
#ifdef BENCH_HOST_PHASE_2_LEVEL_3
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record start time
    cudaEventRecord(start, streams[0]);
#endif

    // fills:
    // `new_remainder` - radix ciphertext
    // `subtraction_overflowed` - single ciphertext
    auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
      cudaEvent_t start, stop;
      cudaEventCreate(&start);
      cudaEventCreate(&stop);
      // Record start time
      cudaEventRecord(start, streams[0]);
#endif
      host_integer_overflowing_sub_kb<Torus, params>(
          streams, gpu_indexes, gpu_count, new_remainder.data,
          subtraction_overflowed.data, merged_interesting_remainder.data,
          interesting_divisor.data, bsk, ksk, mem_ptr->overflow_sub_mem,
          merged_interesting_remainder.len);
#ifdef BENCH_LEVEL_4
      cudaEventRecord(stop, streams[0]);
      cudaEventSynchronize(stop);

      float milliseconds = 0;
      cudaEventElapsedTime(&milliseconds, start, stop);
      total_phase_2_1 += milliseconds;
#endif
    };

    // fills:
    // `at_least_one_upper_block_is_non_zero` - single ciphertext
    auto check_divisor_upper_blocks = [&](cudaStream_t *streams,
                                          uint32_t *gpu_indexes,
                                          uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
      cudaEvent_t start, stop;
      cudaEventCreate(&start);
      cudaEventCreate(&stop);
      // Record start time
      cudaEventRecord(start, streams[0]);
#endif
      auto &trivial_blocks = divisor_ms_blocks;
      if (trivial_blocks.is_empty()) {
        cuda_memset_async(at_least_one_upper_block_is_non_zero.first_block(), 0,
                          big_lwe_size_bytes, streams[0], gpu_indexes[0]);
      } else {

        // We could call unchecked_scalar_ne
        // But we are in the special case where scalar == 0
        // So we can skip some stuff
        host_compare_with_zero_equality(
            streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
            mem_ptr->comparison_buffer, bsk, ksk, trivial_blocks.len,
            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);

        tmp_1.len =
            ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);

        is_at_least_one_comparisons_block_true(
            streams, gpu_indexes, gpu_count,
            at_least_one_upper_block_is_non_zero.data, tmp_1.data,
            mem_ptr->comparison_buffer, bsk, ksk, tmp_1.len);
      }
#ifdef BENCH_LEVEL_4
      cudaEventRecord(stop, streams[0]);
      cudaEventSynchronize(stop);

      float milliseconds = 0;
      cudaEventElapsedTime(&milliseconds, start, stop);
      total_phase_2_2 += milliseconds;
#endif
    };
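    // A sketch of the reasoning behind the ceil_div above: each block
    // produced by host_compare_with_zero_equality can absorb the sum of up
    // to message_modulus * carry_modulus - 1 comparison flags without
    // overflowing its carry space (15 flags with 2_2 parameters), which is
    // why tmp_1 shrinks to ceil_div(trivial_blocks.len, ...) blocks before
    // the final is_at_least_one_comparisons_block_true reduction.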

    // Creates a cleaned version (noise wise) of the merged remainder
    // so that it can be safely used in bivariate PBSes
    // fills:
    // `cleaned_merged_interesting_remainder` - radix ciphertext
    auto create_clean_version_of_merged_remainder =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
          cudaEvent_t start, stop;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          // Record start time
          cudaEventRecord(start, streams[0]);
#endif
          integer_radix_apply_univariate_lookup_table_kb(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data, bsk, ksk,
              cleaned_merged_interesting_remainder.len,
              mem_ptr->message_extract_lut_1);
#ifdef BENCH_LEVEL_4
          cudaEventRecord(stop, streams[0]);
          cudaEventSynchronize(stop);

          float milliseconds = 0;
          cudaEventElapsedTime(&milliseconds, start, stop);
          total_phase_2_3 += milliseconds;
#endif
        };
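    // The message-extraction LUT above refreshes both noise and degree:
    // after the plain host_addition merge the blocks are not at nominal
    // noise level, and a clean copy is needed before the bivariate PBSes of
    // phase 3 (the conditionally_zero_out_* lambdas below).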

    // phase 2
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
#pragma omp parallel sections
    {
#pragma omp section
      {
        // new_remainder
        // subtraction_overflowed
        do_overflowing_sub(&mem_ptr->sub_stream_1, &gpu_indexes[0], 1);
      }
#pragma omp section
      {
        // at_least_one_upper_block_is_non_zero
        check_divisor_upper_blocks(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1);
      }
#pragma omp section
      {
        // cleaned_merged_interesting_remainder
        create_clean_version_of_merged_remainder(&mem_ptr->sub_stream_3,
                                                 &gpu_indexes[0], 1);
      }
    }
    cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
    cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
    cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);

#ifdef BENCH_HOST_PHASE_2_LEVEL_3
    cudaEventRecord(stop, streams[0]);
    cudaEventSynchronize(stop);

    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    total_phase_2 += milliseconds;
    // printf("----Time for phase_2 operations: %.3f ms\n", milliseconds);
#endif

#ifdef BENCH_OVERFLOW_SUM_LEVEL_2
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record start time
    cudaEventRecord(start, streams[0]);
#endif
    host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
                  subtraction_overflowed.data,
                  at_least_one_upper_block_is_non_zero.data,
                  radix_params.big_lwe_dimension, 1);

    int factor = (i) ? 3 : 2;
    int factor_lut_id = factor - 2;
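    // Why factor is 2 on the first iteration and 3 afterwards (a sketch of
    // the degree bookkeeping): overflow_sum = subtraction_overflowed +
    // at_least_one_upper_block_is_non_zero. At i == 0 divisor_ms_blocks is
    // empty, so the second flag is trivially 0 and overflow_sum <= 1; later
    // iterations can reach 2. The bivariate LUTs below pack
    // lhs * factor + rhs, so factor must exceed the largest possible value
    // of overflow_sum.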
    overflow_sum_radix.fill_with_same_ciphertext(
        overflow_sum.first_block(), cleaned_merged_interesting_remainder.len,
        streams[0], gpu_indexes[0]);

#ifdef BENCH_OVERFLOW_SUM_LEVEL_2
    cudaEventRecord(stop, streams[0]);
    cudaEventSynchronize(stop);

    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    total_phase_overflow += milliseconds;
    // printf("----Time for overflow_sum operations: %.3f ms\n", milliseconds);
#endif
#ifdef BENCH_HOST_PHASE_3_LEVEL_3
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record start time
    cudaEventRecord(start, streams[0]);
#endif
    auto conditionally_zero_out_merged_interesting_remainder =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
          cudaEvent_t start, stop;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          // Record start time
          cudaEventRecord(start, streams[0]);
#endif
          integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data,
              overflow_sum_radix.data, bsk, ksk,
              cleaned_merged_interesting_remainder.len,
              mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
              factor);
#ifdef BENCH_LEVEL_4
          cudaEventRecord(stop, streams[0]);
          cudaEventSynchronize(stop);

          float milliseconds = 0;
          cudaEventElapsedTime(&milliseconds, start, stop);
          total_phase_3_1 += milliseconds;
#endif
        };

    auto conditionally_zero_out_merged_new_remainder =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
          cudaEvent_t start, stop;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          // Record start time
          cudaEventRecord(start, streams[0]);
#endif
          integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
              streams, gpu_indexes, gpu_count, new_remainder.data,
              new_remainder.data, overflow_sum_radix.data, bsk, ksk,
              new_remainder.len,
              mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
#ifdef BENCH_LEVEL_4
          cudaEventRecord(stop, streams[0]);
          cudaEventSynchronize(stop);

          float milliseconds = 0;
          cudaEventElapsedTime(&milliseconds, start, stop);
          total_phase_3_2 += milliseconds;
#endif
        };

    auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
                                uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
      cudaEvent_t start, stop;
      cudaEventCreate(&start);
      cudaEventCreate(&stop);
      // Record start time
      cudaEventRecord(start, streams[0]);
#endif
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, did_not_overflow.data,
          subtraction_overflowed.data,
          at_least_one_upper_block_is_non_zero.data, bsk, ksk, 1,
          mem_ptr->merge_overflow_flags_luts[pos_in_block]);

      host_addition(streams[0], gpu_indexes[0],
                    &quotient[block_of_bit * big_lwe_size],
                    &quotient[block_of_bit * big_lwe_size],
                    did_not_overflow.data, radix_params.big_lwe_dimension, 1);
#ifdef BENCH_LEVEL_4
      cudaEventRecord(stop, streams[0]);
      cudaEventSynchronize(stop);

      float milliseconds = 0;
      cudaEventElapsedTime(&milliseconds, start, stop);
      total_phase_3_3 += milliseconds;
#endif
    };
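    // A sketch of how the quotient bit lands in place:
    // merge_overflow_flags_luts[pos_in_block] is expected to output
    // 1 << pos_in_block exactly when the subtraction did not overflow and
    // the upper divisor blocks are zero, so the plain host_addition above
    // deposits the new quotient bit at the right position of block
    // block_of_bit without any carry propagation.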

    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
#pragma omp parallel sections
    {
#pragma omp section
      {
        // cleaned_merged_interesting_remainder
        conditionally_zero_out_merged_interesting_remainder(
            &mem_ptr->sub_stream_1, &gpu_indexes[0], 1);
      }
#pragma omp section
      {
        // new_remainder
        conditionally_zero_out_merged_new_remainder(&mem_ptr->sub_stream_2,
                                                    &gpu_indexes[0], 1);
      }
#pragma omp section
      {
        // quotient
        set_quotient_bit(&mem_ptr->sub_stream_3, &gpu_indexes[0], 1);
      }
    }
    cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
    cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
    cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);

    assert(first_trivial_block - 1 == cleaned_merged_interesting_remainder.len);
    assert(first_trivial_block - 1 == new_remainder.len);

    remainder1.copy_from(cleaned_merged_interesting_remainder, 0,
                         first_trivial_block - 1, streams[0], gpu_indexes[0]);
    remainder2.copy_from(new_remainder, 0, first_trivial_block - 1, streams[0],
                         gpu_indexes[0]);
#ifdef BENCH_HOST_PHASE_3_LEVEL_3
    cudaEventRecord(stop, streams[0]);
    cudaEventSynchronize(stop);

    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    total_phase_3 += milliseconds;
    // printf("----Time for phase_3 operations: %.3f ms\n", milliseconds);
#endif
  }

  printf("----phase_1 total = %.3f ms; avg = %.3f ms\n", total_phase_1,
         total_phase_1 / total_bits);
  {
    printf("------phase_1_1 total = %.3f ms; avg = %.3f ms\n", total_phase_1_1,
           total_phase_1_1 / total_bits);
    printf("------phase_1_2 total = %.3f ms; avg = %.3f ms\n", total_phase_1_2,
           total_phase_1_2 / total_bits);
    printf("------phase_1_3 total = %.3f ms; avg = %.3f ms\n", total_phase_1_3,
           total_phase_1_3 / total_bits);
    printf("------phase_1_4 total = %.3f ms; avg = %.3f ms\n", total_phase_1_4,
           total_phase_1_4 / total_bits);
  }

  printf("----phase_2 total = %.3f ms; avg = %.3f ms\n", total_phase_2,
         total_phase_2 / total_bits);
  {
    printf("------phase_2_1 total = %.3f ms; avg = %.3f ms\n", total_phase_2_1,
           total_phase_2_1 / total_bits);
    printf("------phase_2_2 total = %.3f ms; avg = %.3f ms\n", total_phase_2_2,
           total_phase_2_2 / total_bits);
    printf("------phase_2_3 total = %.3f ms; avg = %.3f ms\n", total_phase_2_3,
           total_phase_2_3 / total_bits);
  }
  // printf("----phase_overflow_sum total = %.3f ms; avg = %.3f ms\n",
  //        total_phase_overflow,
  //        total_phase_overflow / total_bits);
  printf("----phase_3 total = %.3f ms; avg = %.3f ms\n", total_phase_3,
         total_phase_3 / total_bits);
  {
    printf("------phase_3_1 total = %.3f ms; avg = %.3f ms\n", total_phase_3_1,
           total_phase_3_1 / total_bits);
    printf("------phase_3_2 total = %.3f ms; avg = %.3f ms\n", total_phase_3_2,
           total_phase_3_2 / total_bits);
    printf("------phase_3_3 total = %.3f ms; avg = %.3f ms\n", total_phase_3_3,
           total_phase_3_3 / total_bits);
  }

#ifdef BENCH_HOST_PHASE_4_LEVEL_2
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // Record start time
  cudaEventRecord(start, streams[0]);
#endif
  assert(remainder1.len == remainder2.len);

  // Clean the quotient and remainder:
  // even though they have no carries, they are not at nominal noise level
  host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
                remainder2.data, radix_params.big_lwe_dimension,
                remainder1.len);

  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
#pragma omp parallel sections
  {
#pragma omp section
    {
      integer_radix_apply_univariate_lookup_table_kb(
          &mem_ptr->sub_stream_1, &gpu_indexes[0], 1, remainder, remainder, bsk,
          ksk, num_blocks, mem_ptr->message_extract_lut_1);
    }
#pragma omp section
    {
      integer_radix_apply_univariate_lookup_table_kb(
          &mem_ptr->sub_stream_2, &gpu_indexes[0], 1, quotient, quotient, bsk,
          ksk, num_blocks, mem_ptr->message_extract_lut_2);
    }
  }
  cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
  cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
#ifdef BENCH_HOST_PHASE_4_LEVEL_2
  cudaEventRecord(stop, streams[0]);
  cudaEventSynchronize(stop);

  milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  printf("--Time for phase_4 operations: %.3f ms\n", milliseconds);
#endif
}

#endif // TFHE_RS_DIV_REM_CUH

@@ -2,65 +2,58 @@
#include <linear_algebra.h>

void cuda_full_propagation_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *input_blocks, int8_t *mem_ptr, void *ksk, void *bsk,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_base_log, uint32_t ks_level, uint32_t pbs_base_log,
    uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_blocks) {
    cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
    void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
    uint32_t num_blocks) {

  switch (polynomial_size) {
  case 256:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(input_blocks),
        stream, static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 512:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(input_blocks),
        stream, static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 1024:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(input_blocks),
        stream, static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 2048:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(input_blocks),
        stream, static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 4096:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(input_blocks),
        stream, static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 8192:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(input_blocks),
        stream, static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 16384:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(input_blocks),
        stream, static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
@@ -73,42 +66,41 @@ void cuda_full_propagation_64_inplace(
}

void scratch_cuda_full_propagation_64(
    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t lwe_dimension,
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {

  scratch_cuda_full_propagation<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension, glwe_dimension,
      polynomial_size, level_count, grouping_factor, input_lwe_ciphertext_count,
      message_modulus, carry_modulus, pbs_type, allocate_gpu_memory);
      stream, (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension,
      glwe_dimension, polynomial_size, level_count, grouping_factor,
      input_lwe_ciphertext_count, message_modulus, carry_modulus, pbs_type,
      allocate_gpu_memory);
}

void cleanup_cuda_full_propagation(void *stream, uint32_t gpu_index,
void cleanup_cuda_full_propagation(cuda_stream_t *stream,
                                   int8_t **mem_ptr_void) {

  int_fullprop_buffer<uint64_t> *mem_ptr =
      (int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
  auto s = static_cast<cudaStream_t>(stream);

  cuda_drop_async(mem_ptr->lut_buffer, s, gpu_index);
  cuda_drop_async(mem_ptr->lut_indexes, s, gpu_index);
  cuda_drop_async(mem_ptr->lut_buffer, stream);
  cuda_drop_async(mem_ptr->lut_indexes, stream);

  cuda_drop_async(mem_ptr->lwe_indexes, s, gpu_index);
  cuda_drop_async(mem_ptr->lwe_indexes, stream);

  cuda_drop_async(mem_ptr->tmp_small_lwe_vector, s, gpu_index);
  cuda_drop_async(mem_ptr->tmp_big_lwe_vector, s, gpu_index);
  cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
  cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);

  switch (mem_ptr->pbs_type) {
  case CLASSICAL: {
    auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(mem_ptr->pbs_buffer);
    x->release(s, gpu_index);
    x->release(stream);
  } break;
  case MULTI_BIT: {
    auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(mem_ptr->pbs_buffer);
    x->release(s, gpu_index);
    x->release(stream);
  } break;
  default:
    PANIC("Cuda error (PBS): unsupported implementation variant.")
@@ -116,7 +108,7 @@ void cleanup_cuda_full_propagation(void *stream, uint32_t gpu_index,
}

void scratch_cuda_propagate_single_carry_kb_64_inplace(
    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -129,63 +121,23 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
                          message_modulus, carry_modulus);

  scratch_cuda_propagate_single_carry_kb_inplace(
      static_cast<cudaStream_t>(stream), gpu_index,
      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
}

void cuda_propagate_single_carry_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
void cuda_propagate_single_carry_kb_64_inplace(cuda_stream_t *stream,
                                               void *lwe_array, int8_t *mem_ptr,
                                               void *bsk, void *ksk,
                                               uint32_t num_blocks) {
  host_propagate_single_carry<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array),
      stream, static_cast<uint64_t *>(lwe_array),
      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
      static_cast<uint64_t *>(ksk), num_blocks);
}

void cleanup_cuda_propagate_single_carry(void *stream, uint32_t gpu_index,
void cleanup_cuda_propagate_single_carry(cuda_stream_t *stream,
                                         int8_t **mem_ptr_void) {
  int_sc_prop_memory<uint64_t> *mem_ptr =
      (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
}

void scratch_cuda_apply_univariate_lut_kb_64(
    void *stream, uint32_t gpu_index, int8_t **mem_ptr, void *input_lut,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus);

  scratch_cuda_apply_univariate_lut_kb<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
      num_radix_blocks, params, allocate_gpu_memory);
}

void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
                                     uint32_t gpu_count, void *output_radix_lwe,
                                     void *input_radix_lwe, int8_t *mem_ptr,
                                     void *ksk, void *bsk,
                                     uint32_t num_blocks) {

  host_apply_univariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(output_radix_lwe),
      static_cast<uint64_t *>(input_radix_lwe),
      (int_radix_lut<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk), bsk,
      num_blocks);
}

void cleanup_cuda_apply_univariate_lut_kb_64(void *stream, uint32_t gpu_index,
                                             int8_t **mem_ptr_void) {
  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
  mem_ptr->release(stream);
}

@@ -82,20 +82,18 @@ device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out,
 * becomes out = m1 * shift + m2
 */
template <typename Torus>
__host__ void pack_bivariate_blocks(cudaStream_t *streams,
                                    uint32_t *gpu_indexes, uint32_t gpu_count,
                                    Torus *lwe_array_out,
__host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
                                    Torus *lwe_indexes_out, Torus *lwe_array_1,
                                    Torus *lwe_array_2, Torus *lwe_indexes_in,
                                    uint32_t lwe_dimension, uint32_t shift,
                                    uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  cudaSetDevice(stream->gpu_index);
  // Left message is shifted
  int num_blocks = 0, num_threads = 0;
  int num_entries = num_radix_blocks * (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
      lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
      lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
@@ -103,9 +101,9 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,

template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, void *bsk, Torus *ksk,
    uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
  cudaSetDevice(stream->gpu_index);
  // apply_lookup_table
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
@@ -121,26 +119,24 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(

  // Compute Keyswitch-PBS
  cuda_keyswitch_lwe_ciphertext_vector(
      streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks,
      lut->lwe_trivial_indexes, lwe_array_in, lut->lwe_indexes_in, ksk,
      big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
      num_radix_blocks);
      stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_in,
      lut->lwe_indexes_in, ksk, big_lwe_dimension, small_lwe_dimension,
      ks_base_log, ks_level, num_radix_blocks);

  execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
                     lut->lwe_indexes_out, lut->lut, lut->lut_indexes,
                     lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk,
                     lut->buffer, glwe_dimension, small_lwe_dimension,
                     polynomial_size, pbs_base_log, pbs_level, grouping_factor,
                     num_radix_blocks, 1, 0,
                     cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
  execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
                     lut->lut_indexes, lut->tmp_lwe_after_ks,
                     lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
                     small_lwe_dimension, polynomial_size, pbs_base_log,
                     pbs_level, grouping_factor, num_radix_blocks, 1, 0,
                     cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}

template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
  cudaSetDevice(gpu_indexes[0]);
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
    Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
    int_radix_lut<Torus> *lut) {
  cudaSetDevice(stream->gpu_index);
  // apply_lookup_table_bivariate
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
@@ -157,71 +153,23 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(

  // Left message is shifted
  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
  pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
                        lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
                        lut->lwe_indexes_in, big_lwe_dimension, message_modulus,
                        num_radix_blocks);
  pack_bivariate_blocks(stream, lwe_array_pbs_in, lut->lwe_trivial_indexes,
                        lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
                        big_lwe_dimension, message_modulus, num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  // Apply LUT
  cuda_keyswitch_lwe_ciphertext_vector(
      streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks,
      lut->lwe_trivial_indexes, lwe_array_pbs_in, lut->lwe_trivial_indexes, ksk,
      big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
      num_radix_blocks);
      stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_pbs_in,
      lut->lwe_trivial_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
      ks_base_log, ks_level, num_radix_blocks);

  execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
                     lut->lwe_indexes_out, lut->lut, lut->lut_indexes,
                     lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk,
                     lut->buffer, glwe_dimension, small_lwe_dimension,
                     polynomial_size, pbs_base_log, pbs_level, grouping_factor,
                     num_radix_blocks, 1, 0,
                     cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
}

template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb_factor(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut,
    uint32_t shift) {
  cudaSetDevice(gpu_indexes[0]);
  // apply_lookup_table_bivariate
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto small_lwe_dimension = params.small_lwe_dimension;
  auto ks_level = params.ks_level;
  auto ks_base_log = params.ks_base_log;
  auto pbs_level = params.pbs_level;
  auto pbs_base_log = params.pbs_base_log;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;
  auto message_modulus = params.message_modulus;

  // Left message is shifted
  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
  pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
                        lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
                        lut->lwe_indexes_in, big_lwe_dimension, shift,
                        num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  // Apply LUT
  cuda_keyswitch_lwe_ciphertext_vector(
      streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks,
      lut->lwe_trivial_indexes, lwe_array_pbs_in, lut->lwe_trivial_indexes, ksk,
      big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
      num_radix_blocks);

  execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
                     lut->lwe_indexes_out, lut->lut, lut->lut_indexes,
                     lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk,
                     lut->buffer, glwe_dimension, small_lwe_dimension,
                     polynomial_size, pbs_base_log, pbs_level, grouping_factor,
                     num_radix_blocks, 1, 0,
                     cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
  execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
                     lut->lut_indexes, lut->tmp_lwe_after_ks,
                     lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
                     small_lwe_dimension, polynomial_size, pbs_base_log,
                     pbs_level, grouping_factor, num_radix_blocks, 1, 0,
                     cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}

// Rotates the slice in-place such that the first mid elements of the slice move
@@ -287,38 +235,19 @@ void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
                               message_modulus, carry_modulus, wrapped_f);
}

template <typename Torus>
void generate_lookup_table_bivariate_with_factor(
    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t message_modulus, uint32_t carry_modulus,
    std::function<Torus(Torus, Torus)> f, int factor) {

  Torus factor_u64 = factor;
  auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
    Torus lhs = (input / factor_u64) % message_modulus;
    Torus rhs = (input % factor_u64) % message_modulus;

    return f(lhs, rhs);
  };

  generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, wrapped_f);
}

/*
 * generate bivariate accumulator for device pointer
 * stream - cuda stream
 * v_stream - cuda stream
 * acc - device pointer for bivariate accumulator
 * ...
 * f - wrapping function with two Torus inputs
 */
template <typename Torus>
void generate_device_accumulator_bivariate(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f) {
    cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
    std::function<Torus(Torus, Torus)> f) {

  cudaSetDevice(gpu_index);
  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -328,66 +257,29 @@ void generate_device_accumulator_bivariate(
      message_modulus, carry_modulus, f);

  // copy host lut and lut_indexes to device
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                           (glwe_dimension + 1) * polynomial_size *
                               sizeof(Torus),
                           stream, gpu_index);
  cuda_memcpy_async_to_gpu(
      acc_bivariate, h_lut,
      (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);

  // Release memory when possible
  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
                           h_lut);
  cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}

/*
 * generate bivariate accumulator with factor scaling for device pointer
 * v_stream - cuda stream
 * acc - device pointer for bivariate accumulator
 * ...
 * f - wrapping function with two Torus inputs
 */
template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor) {

  cudaSetDevice(gpu_index);
  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));

  // fill bivariate accumulator
  generate_lookup_table_bivariate_with_factor<Torus>(
      h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
      factor);

  // copy host lut and lut_indexes to device
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                           (glwe_dimension + 1) * polynomial_size *
                               sizeof(Torus),
                           stream, gpu_index);

  // Release memory when possible
  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
                           h_lut);
}

/*
 * generate accumulator for device pointer
 * generate bivariate accumulator for device pointer
 * v_stream - cuda stream
 * acc - device pointer for accumulator
 * ...
 * f - evaluating function with one Torus input
 */
template <typename Torus>
void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
                                 Torus *acc, uint32_t glwe_dimension,
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size,
                                 uint32_t message_modulus,
                                 uint32_t carry_modulus,
                                 std::function<Torus(Torus)> f) {

  cudaSetDevice(gpu_index);
  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -399,27 +291,24 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
  // copy host lut and lut_indexes to device
  cuda_memcpy_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index);
      stream);

  // Release memory when possible
  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
                           h_lut);
  cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_propagate_single_carry_kb_inplace(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
int_sc_prop_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
cuda_stream_t *stream, int_sc_prop_memory<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
*mem_ptr = new int_sc_prop_memory<Torus>(
|
||||
stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory);
|
||||
*mem_ptr = new int_sc_prop_memory<Torus>(stream, params, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *lwe_array,
|
||||
void host_propagate_single_carry(cuda_stream_t *stream, Torus *lwe_array,
|
||||
int_sc_prop_memory<Torus> *mem, void *bsk,
|
||||
Torus *ksk, uint32_t num_blocks) {
|
||||
auto params = mem->params;
|
||||
@@ -436,17 +325,15 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
auto message_acc = mem->message_acc;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsk,
|
||||
ksk, num_blocks, luts_array);
|
||||
stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
|
||||
luts_array);
|
||||
|
||||
// compute prefix sum with hillis&steele
|
||||
|
||||
int num_steps = ceil(log2((double)num_blocks));
|
||||
int space = 1;
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
|
||||
big_lwe_size_bytes * num_blocks, streams[0],
|
||||
gpu_indexes[0]);
|
||||
big_lwe_size_bytes * num_blocks, stream);
|
||||
|
||||
for (int step = 0; step < num_steps; step++) {
|
||||
auto cur_blocks = &step_output[space * big_lwe_size];
|
||||
@@ -454,38 +341,32 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
int cur_total_blocks = num_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
|
||||
bsk, ksk, cur_total_blocks, luts_carry_propagation_sum);
|
||||
stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
|
||||
luts_carry_propagation_sum);
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
&generates_or_propagates[space * big_lwe_size], cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
|
||||
cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, stream);
|
||||
space *= 2;
|
||||
}
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
|
||||
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
|
||||
step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
|
||||
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
|
||||
|
||||
host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
|
||||
host_addition(stream, lwe_array, lwe_array, step_output,
|
||||
glwe_dimension * polynomial_size, num_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsk, ksk,
|
||||
num_blocks, message_acc);
|
||||
stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_propagate_single_sub_borrow(cudaStream_t *streams,
|
||||
uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *overflowed, Torus *lwe_array,
|
||||
void host_propagate_single_sub_borrow(cuda_stream_t *stream, Torus *overflowed,
|
||||
Torus *lwe_array,
|
||||
int_single_borrow_prop_memory<Torus> *mem,
|
||||
void *bsk, Torus *ksk,
|
||||
uint32_t num_blocks) {
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
@@ -500,15 +381,14 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
|
||||
auto message_acc = mem->message_acc;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsk,
|
||||
ksk, num_blocks, luts_array);
|
||||
stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
|
||||
luts_array);
|
||||
|
||||
// compute prefix sum with hillis&steele
|
||||
int num_steps = ceil(log2((double)num_blocks));
|
||||
int space = 1;
|
||||
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
|
||||
big_lwe_size_bytes * num_blocks, streams[0],
|
||||
gpu_indexes[0]);
|
||||
big_lwe_size_bytes * num_blocks, stream);
|
||||
|
||||
for (int step = 0; step < num_steps; step++) {
|
||||
auto cur_blocks = &step_output[space * big_lwe_size];
|
||||
@@ -516,30 +396,28 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
|
||||
int cur_total_blocks = num_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
|
||||
bsk, ksk, cur_total_blocks, luts_carry_propagation_sum);
|
||||
stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
|
||||
luts_carry_propagation_sum);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
&generates_or_propagates[space * big_lwe_size], cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
|
||||
cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, stream);
|
||||
space *= 2;
|
||||
}
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
|
||||
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
big_lwe_size_bytes, stream);
|
||||
|
||||
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
|
||||
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
|
||||
step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
|
||||
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
|
||||
|
||||
host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
|
||||
step_output, glwe_dimension * polynomial_size, num_blocks);
|
||||
host_subtraction(stream, lwe_array, lwe_array, step_output,
|
||||
glwe_dimension * polynomial_size, num_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsk, ksk,
|
||||
num_blocks, message_acc);
|
||||
stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -551,8 +429,7 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
* size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
*/
template <typename Torus, typename STorus, class params>
void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *input_blocks,
void host_full_propagate_inplace(cuda_stream_t *stream, Torus *input_blocks,
int_fullprop_buffer<Torus> *mem_ptr,
Torus *ksk, void *bsk, uint32_t lwe_dimension,
uint32_t glwe_dimension,
@@ -561,7 +438,6 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t pbs_level, uint32_t grouping_factor,
uint32_t num_blocks) {

cudaSetDevice(gpu_indexes[0]);
int big_lwe_size = (glwe_dimension * polynomial_size + 1);
int small_lwe_size = (lwe_dimension + 1);

@@ -569,32 +445,29 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
auto cur_input_block = &input_blocks[i * big_lwe_size];

cuda_keyswitch_lwe_ciphertext_vector<Torus>(
streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector,
mem_ptr->lwe_indexes, cur_input_block, mem_ptr->lwe_indexes, ksk,
stream, mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes,
cur_input_block, mem_ptr->lwe_indexes, ksk,
polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
1);

cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
mem_ptr->tmp_small_lwe_vector,
small_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
small_lwe_size * sizeof(Torus), stream);

execute_pbs<Torus>(
streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector,
mem_ptr->lwe_indexes, mem_ptr->lut_buffer, mem_ptr->lut_indexes,
stream, mem_ptr->tmp_big_lwe_vector, mem_ptr->lwe_indexes,
mem_ptr->lut_buffer, mem_ptr->lut_indexes,
mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk,
mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, grouping_factor, 2, 2, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), mem_ptr->pbs_type);
cuda_get_max_shared_memory(stream->gpu_index), mem_ptr->pbs_type);

cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
big_lwe_size * sizeof(Torus), stream);

if (i < num_blocks - 1) {
auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
host_addition(streams[0], gpu_indexes[0], next_input_block,
next_input_block,
host_addition(stream, next_input_block, next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
glwe_dimension * polynomial_size, 1);
}
@@ -603,18 +476,18 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
void scratch_cuda_full_propagation(
cudaStream_t stream, uint32_t gpu_index,
int_fullprop_buffer<Torus> **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t pbs_level,
uint32_t grouping_factor, uint32_t num_radix_blocks,
cuda_stream_t *stream, int_fullprop_buffer<Torus> **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {

int8_t *pbs_buffer;
execute_scratch_pbs<Torus>(
stream, gpu_index, &pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, pbs_level, grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(gpu_index), pbs_type, allocate_gpu_memory);
execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, pbs_level, grouping_factor,
num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index),
pbs_type, allocate_gpu_memory);

// LUT
Torus *lut_buffer;
@@ -624,7 +497,7 @@ void scratch_cuda_full_propagation(
Torus lut_buffer_size =
2 * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);

lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream, gpu_index);
lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream);

// LUTs
auto lut_f_message = [message_modulus](Torus x) -> Torus {
@@ -640,36 +513,34 @@ void scratch_cuda_full_propagation(
lut_buffer + (glwe_dimension + 1) * polynomial_size;

generate_device_accumulator<Torus>(
stream, gpu_index, lut_buffer_message, glwe_dimension, polynomial_size,
stream, lut_buffer_message, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_message);

generate_device_accumulator<Torus>(
stream, gpu_index, lut_buffer_carry, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry);
generate_device_accumulator<Torus>(stream, lut_buffer_carry, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_carry);
}

Torus *lut_indexes;
if (allocate_gpu_memory) {
lut_indexes =
(Torus *)cuda_malloc_async(2 * sizeof(Torus), stream, gpu_index);
lut_indexes = (Torus *)cuda_malloc_async(2 * sizeof(Torus), stream);

Torus h_lut_indexes[2] = {0, 1};
cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, 2 * sizeof(Torus),
stream, gpu_index);
stream);
}

Torus *lwe_indexes;
if (allocate_gpu_memory) {
Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);

lwe_indexes =
(Torus *)cuda_malloc_async(lwe_indexes_size, stream, gpu_index);
lwe_indexes = (Torus *)cuda_malloc_async(lwe_indexes_size, stream);
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
stream, gpu_index);
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
stream);
cuda_stream_add_callback(stream, host_free_on_stream_callback,
h_lwe_indexes);
}

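Note the pattern just above: `h_lwe_indexes` is handed to `cuda_stream_add_callback` instead of being freed immediately, since the async copy may still be reading it. The callback helper itself is not part of this diff; a minimal sketch of what it plausibly looks like, built directly on the documented `cudaStreamAddCallback` signature (the body here is an assumption, not code from this diff):

#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical implementation sketch: CUDA runs stream callbacks only after
// all work enqueued on the stream before them has completed, so freeing the
// staged host buffer here cannot race with the asynchronous copy above.
void CUDART_CB host_free_on_stream_callback(cudaStream_t stream,
                                            cudaError_t status,
                                            void *host_pointer) {
  (void)stream; // unused: only the user pointer matters here
  (void)status; // a production version would surface async errors
  free(host_pointer);
}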
@@ -681,10 +552,8 @@ void scratch_cuda_full_propagation(
Torus big_vector_size =
2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus);

small_lwe_vector =
(Torus *)cuda_malloc_async(small_vector_size, stream, gpu_index);
big_lwe_vector =
(Torus *)cuda_malloc_async(big_vector_size, stream, gpu_index);
small_lwe_vector = (Torus *)cuda_malloc_async(small_vector_size, stream);
big_lwe_vector = (Torus *)cuda_malloc_async(big_vector_size, stream);
}

*mem_ptr = new int_fullprop_buffer<Torus>;
@@ -738,16 +607,19 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
//
// Expects the carry buffer to be empty
template <typename Torus>
__host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *lwe_array_in,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t factor) {
cudaSetDevice(gpu_index);
__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t factor) {
if (lwe_array_out == lwe_array_in)
PANIC("Cuda error in pack blocks: input and output pointers must be "
"different.");

cudaSetDevice(stream->gpu_index);

int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
device_pack_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
}

@@ -767,16 +639,14 @@ device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,

template <typename Torus>
__host__ void
create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *scalar_array,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks, uint64_t message_modulus,
uint64_t carry_modulus) {
create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *scalar_array, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
uint64_t message_modulus, uint64_t carry_modulus) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream,
gpu_index);
cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);

if (num_scalar_blocks == 0)
return;
@@ -793,7 +663,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_create_trivial_radix<<<grid, thds, 0, stream>>>(
device_create_trivial_radix<<<grid, thds, 0, stream->stream>>>(
lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
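The `delta` above is the standard plaintext scaling: the clear digit lives in the top bits of the 64-bit torus, below the padding bit. With `message_modulus = 4` and `carry_modulus = 4`, `delta = 2^63 / 16 = 2^59`. A host-side sketch of the trivial encoding this kernel performs (an illustration inferred from the arguments and the memset above, not the device code itself):

#include <cstdint>
#include <vector>

// Illustrative trivial-LWE block: all-zero mask, body = digit * delta.
// The mask/body layout follows the comment later in this diff: lwe_dimension
// mask elements followed by one body element.
std::vector<uint64_t> trivial_block(uint64_t digit, uint32_t lwe_dimension,
                                    uint64_t message_modulus,
                                    uint64_t carry_modulus) {
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
  std::vector<uint64_t> block(lwe_dimension + 1, 0); // mask stays zero
  block[lwe_dimension] = digit * delta;              // e.g. 3 -> 3 << 59
  return block;
}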
@@ -804,26 +674,23 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
* * (lwe_dimension+1) * sizeof(Torus) bytes
*/
template <typename Torus>
__host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
__host__ void extract_n_bits(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in, void *bsk, Torus *ksk,
uint32_t num_radix_blocks, uint32_t bits_per_block,
int_bit_extract_luts_buffer<Torus> *bit_extract) {

integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsk, ksk,
stream, lwe_array_out, lwe_array_in, bsk, ksk,
num_radix_blocks * bits_per_block, bit_extract->lut);
}

template <typename Torus>
__host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *signs_array_out,
__host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
Torus *signs_array_in,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f,
void *bsk, Torus *ksk, uint32_t num_sign_blocks) {

cudaSetDevice(gpu_indexes[0]);
auto diff_buffer = mem_ptr->diff_buffer;

auto params = mem_ptr->params;
@@ -844,22 +711,20 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
auto signs_a = diff_buffer->tmp_signs_a;
auto signs_b = diff_buffer->tmp_signs_b;

cuda_memcpy_async_gpu_to_gpu(signs_a, signs_array_in,
(big_lwe_dimension + 1) * num_sign_blocks *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
signs_a, signs_array_in,
(big_lwe_dimension + 1) * num_sign_blocks * sizeof(Torus), stream);
if (num_sign_blocks > 2) {
auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, reduce_two_orderings_function);
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, reduce_two_orderings_function);

while (num_sign_blocks > 2) {
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
big_lwe_dimension, num_sign_blocks, 4);
pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, num_sign_blocks,
4);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsk, ksk,
num_sign_blocks / 2, lut);
stream, signs_a, signs_b, bsk, ksk, num_sign_blocks / 2, lut);

auto last_block_signs_b =
signs_b + (num_sign_blocks / 2) * (big_lwe_dimension + 1);
@@ -868,7 +733,7 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
if (num_sign_blocks % 2 == 1)
cuda_memcpy_async_gpu_to_gpu(last_block_signs_a, last_block_signs_b,
(big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
stream);

num_sign_blocks = (num_sign_blocks / 2) + (num_sign_blocks % 2);
}
@@ -882,14 +747,12 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
};

auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f);
generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, final_lut_f);

pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
2, 4);
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, signs_array_out,
pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, 2, 4);
integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
signs_b, bsk, ksk, 1, lut);

} else {
@@ -900,41 +763,12 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
};

auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f);
generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, final_lut_f);

integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, signs_array_out,
integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
signs_a, bsk, ksk, 1, lut);
}
}

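The reduction above is logarithmic: each pass packs pairs of sign blocks (factor 4, i.e. one 2-bit ordering shifted over another) and collapses each pair with one LUT application, carrying any odd leftover block along unchanged. A tiny sketch of the schedule arithmetic, lifted verbatim from the loop (illustration only):

#include <cstdio>

// How many packed-LUT rounds until at most two sign blocks remain.
int reduce_signs_rounds(int num_sign_blocks) {
  int rounds = 0;
  while (num_sign_blocks > 2) {
    num_sign_blocks = (num_sign_blocks / 2) + (num_sign_blocks % 2);
    ++rounds;
  }
  return rounds; // e.g. 7 -> 4 -> 2 takes two rounds
}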
template <typename Torus>
void scratch_cuda_apply_univariate_lut_kb(
cudaStream_t stream, uint32_t gpu_index, int_radix_lut<Torus> **mem_ptr,
Torus *input_lut, uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {

*mem_ptr = new int_radix_lut<Torus>(stream, gpu_index, params, 1,
num_radix_blocks, allocate_gpu_memory);
cuda_memcpy_async_to_gpu((*mem_ptr)->lut, input_lut,
(params.glwe_dimension + 1) *
params.polynomial_size * sizeof(Torus),
stream, gpu_index);
}

template <typename Torus>
void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out,
Torus *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *ksk,
void *bsk, uint32_t num_blocks) {

cudaSetDevice(gpu_indexes[0]);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsk, ksk,
num_blocks, mem);
}

#endif // TFHE_RS_INTERNAL_INTEGER_CUH

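Throughout this file, every `(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count)` triple collapses into one `cuda_stream_t *`. The wrapper's definition is not part of this diff; judging from the `stream->stream` launch sites and the `stream->gpu_index` device selections above, a minimal sketch would be:

#include <cstdint>
#include <cuda_runtime.h>

// Sketch (assumption: the real definition lives in the backend's device
// header): a raw CUDA stream pinned to the GPU it was created on, so callers
// pass one handle instead of separate stream/index/count arguments.
struct cuda_stream_t {
  cudaStream_t stream;
  uint32_t gpu_index;

  explicit cuda_stream_t(uint32_t gpu_index) : gpu_index(gpu_index) {
    cudaSetDevice(gpu_index);
    cudaStreamCreate(&stream);
  }
};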
@@ -66,12 +66,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
* the integer radix multiplication in keyswitch->bootstrap order.
*/
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
uint32_t num_radix_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory,
bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size * glwe_dimension, lwe_dimension,
@@ -79,21 +79,14 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
grouping_factor, message_modulus, carry_modulus);

switch (polynomial_size) {
case 256:
case 512:
case 1024:
case 2048:
case 4096:
case 8192:
case 16384:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
allocate_gpu_memory);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
"Only N = 2048 is supported")
}
}

@@ -126,69 +119,18 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
* - 'max_shared_memory' maximum shared memory per cuda block
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right, void *bsk,
void *ksk, int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_left,
void *radix_lwe_right, void *bsk, void *ksk, int8_t *mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
uint32_t max_shared_memory) {

switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
@@ -196,21 +138,20 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
"Only N = 2048 is supported")
}
}

void cleanup_cuda_integer_mult(void *stream, uint32_t gpu_index,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_mult(cuda_stream_t *stream, int8_t **mem_ptr_void) {

int_mul_memory<uint64_t> *mem_ptr =
(int_mul_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks_in_radix,
@@ -222,15 +163,14 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
max_num_radix_in_vec, params, allocate_gpu_memory);
stream, (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr,
num_blocks_in_radix, max_num_radix_in_vec, params, allocate_gpu_memory);
}

void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks_in_radix) {
cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_vec,
uint32_t num_radix_in_vec, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks_in_radix) {

auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;

@@ -244,65 +184,58 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
switch (mem->params.polynomial_size) {
case 512:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 1024:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 2048:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 4096:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 8192:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 16384:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
PANIC("Cuda error (integer sum ciphertexts): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
}

free(terms_degree);
}

void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_sum_ciphertexts_vec(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

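Every operation in this file keeps the same three entry points after the signature change: `scratch_*` allocates the temporaries, the operation consumes them, and `cleanup_*` releases them, all against one `cuda_stream_t`. A hedged usage sketch of the multiplication triple (every numeric value below is a placeholder rather than a recommended parameter set, and `CLASSICAL` stands in for whichever `PBS_TYPE` the caller actually uses):

// Illustrative call sequence only; assumes stream, ciphertext buffers, bsk
// and ksk were set up elsewhere, and that all numeric parameters are dummies.
int8_t *mem_ptr = nullptr;
scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    stream, &mem_ptr, /*message_modulus=*/4, /*carry_modulus=*/4,
    /*glwe_dimension=*/1, /*lwe_dimension=*/742, /*polynomial_size=*/2048,
    /*pbs_base_log=*/23, /*pbs_level=*/1, /*ks_base_log=*/3, /*ks_level=*/5,
    /*grouping_factor=*/0, /*num_radix_blocks=*/8, CLASSICAL,
    max_shared_memory, /*allocate_gpu_memory=*/true);

cuda_integer_mult_radix_ciphertext_kb_64(
    stream, radix_lwe_out, radix_lwe_left, radix_lwe_right, bsk, ksk, mem_ptr,
    4, 4, 1, 742, 2048, 23, 1, 3, 5, 0, /*num_blocks=*/8, CLASSICAL,
    max_shared_memory);

cleanup_cuda_integer_mult(stream, &mem_ptr);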
@@ -91,15 +91,12 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
}
}

template <typename Torus, sharedMemDegree SMD>
template <typename Torus>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {

extern __shared__ int8_t sharedmem[];

Torus *result = (Torus *)sharedmem;

extern __shared__ Torus result[];
size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
@@ -109,9 +106,6 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
size_t block_stride = blockIdx.y * block_size;
auto dst_block = &dst_radix[block_stride];

if constexpr (SMD == NOSM)
result = dst_block;

// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = tid; i < block_size; i += stride) {
@@ -127,9 +121,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
}

// put result from shared mem to global mem
if constexpr (SMD == FULLSM)
for (int i = tid; i < block_size; i += stride)
dst_block[i] = result[i];
for (int i = tid; i < block_size; i += stride) {
dst_block[i] = result[i];
}
}

template <typename Torus, class params>
@@ -181,44 +175,39 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
}
template <typename Torus>
__host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
cudaStream_t stream, uint32_t gpu_index,
int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
cuda_stream_t *stream, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
int_radix_params params, bool allocate_gpu_memory) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
stream, gpu_index, params, num_blocks_in_radix, max_num_radix_in_vec,
stream, params, num_blocks_in_radix, max_num_radix_in_vec,
allocate_gpu_memory);
}

template <typename Torus, class params>
__host__ void host_integer_sum_ciphertexts_vec_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *terms, int *terms_degree, void *bsk,
uint64_t *ksk, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
cuda_stream_t *stream, Torus *radix_lwe_out, Torus *terms,
int *terms_degree, void *bsk, uint64_t *ksk,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec) {

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto new_blocks = mem_ptr->new_blocks;
auto old_blocks = mem_ptr->old_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;

auto luts_message_carry = mem_ptr->luts_message_carry;

auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;

auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;

@@ -235,7 +224,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
num_blocks_in_radix * num_radix_in_vec *
big_lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
stream);
}

size_t r = num_radix_in_vec;
@@ -248,7 +237,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
int32_t h_smart_copy_in[r * num_blocks];
int32_t h_smart_copy_out[r * num_blocks];

auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);

while (r > 2) {
size_t cur_total_blocks = r * num_blocks;
@@ -258,14 +247,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);

if (sm_size < max_shared_memory)
tree_add_chunks<Torus, FULLSM><<<add_grid, 512, sm_size, streams[0]>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
else
tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, streams[0]>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);

check_cuda_error(cudaGetLastError());
tree_add_chunks<Torus><<<add_grid, 512, sm_size, stream->stream>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);

size_t total_count = 0;
size_t message_count = 0;
@@ -277,71 +260,36 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
total_count, message_count, carry_count, sm_copy_count);

// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
auto luts_message_carry = new int_radix_lut<Torus>(
streams[0], gpu_indexes[0], mem_ptr->params, 2, total_count, true);

auto message_acc = luts_message_carry->get_lut(0);
auto carry_acc = luts_message_carry->get_lut(1);

// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};

// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry);

auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;

size_t copy_size = total_count * sizeof(Torus);
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size, stream);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size, stream);
copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
stream);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
streams[0], gpu_indexes[0]);
stream);

smart_copy<<<sm_copy_count, 256, 0, streams[0]>>>(
smart_copy<<<sm_copy_count, 256, 0, stream->stream>>>(
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
big_lwe_size);
check_cuda_error(cudaGetLastError());

if (carry_count > 0)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_message_carry->get_lut_indexes(message_count), 1, carry_count);
&(stream->stream), luts_message_carry->get_lut_indexes(message_count),
1, carry_count);

cuda_keyswitch_lwe_ciphertext_vector(
streams[0], gpu_indexes[0], small_lwe_vector, lwe_indexes_in,
new_blocks, lwe_indexes_in, ksk, polynomial_size * glwe_dimension,
lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
message_count);
stream, small_lwe_vector, lwe_indexes_in, new_blocks, lwe_indexes_in,
ksk, polynomial_size * glwe_dimension, lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);

execute_pbs<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
lwe_indexes_out, luts_message_carry->lut,
luts_message_carry->lut_indexes, small_lwe_vector,
lwe_indexes_in, bsk, luts_message_carry->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0,
max_shared_memory, mem_ptr->params.pbs_type);

luts_message_carry->release(streams[0], gpu_indexes[0]);
execute_pbs<Torus>(
stream, new_blocks, lwe_indexes_out, luts_message_carry->lut,
luts_message_carry->lut_indexes, small_lwe_vector, lwe_indexes_in, bsk,
luts_message_carry->buffer, glwe_dimension, lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count,
2, 0, max_shared_memory, mem_ptr->params.pbs_type);

int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_blocks;
@@ -349,29 +297,26 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(

auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, streams[0],
gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_blocks;
}

host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
host_addition(stream, radix_lwe_out, old_blocks,
&old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
num_blocks);

host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
radix_lwe_out, mem_ptr->scp_mem, bsk, ksk,
num_blocks);
host_propagate_single_carry<Torus>(stream, radix_lwe_out, mem_ptr->scp_mem,
bsk, ksk, num_blocks);
}

template <typename Torus, typename STorus, class params>
__host__ void host_integer_mult_radix_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
cuda_stream_t *stream, uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
@@ -438,14 +383,13 @@ __host__ void host_integer_mult_radix_kb(
dim3 grid(lsb_vector_block_count, 1, 1);
dim3 thds(params::degree / params::opt, 1, 1);

all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
vector_lsb_rhs, vector_msb_rhs, num_blocks);
check_cuda_error(cudaGetLastError());

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res,
vector_result_sb, bsk, ksk, total_block_count, luts_array);
stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
total_block_count, luts_array);

vector_result_lsb = &block_mul_res[0];
vector_result_msb = &block_mul_res[lsb_vector_block_count *
@@ -453,10 +397,10 @@ __host__ void host_integer_mult_radix_kb(

fill_radix_from_lsb_msb<Torus, params>
<<<num_blocks * num_blocks, params::degree / params::opt, 0,
streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
glwe_dimension, lsb_vector_block_count,
msb_vector_block_count, num_blocks);
check_cuda_error(cudaGetLastError());
stream->stream>>>(vector_result_sb, vector_result_lsb,
vector_result_msb, glwe_dimension,
lsb_vector_block_count, msb_vector_block_count,
num_blocks);

int terms_degree[2 * num_blocks * num_blocks];
for (int i = 0; i < num_blocks * num_blocks; i++) {
@@ -472,35 +416,25 @@ __host__ void host_integer_mult_radix_kb(
}

host_integer_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb,
terms_degree, bsk, ksk, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks);
stream, radix_lwe_out, vector_result_sb, terms_degree, bsk, ksk,
mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
}

template <typename Torus>
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
cudaStream_t stream, uint32_t gpu_index, int_mul_memory<Torus> **mem_ptr,
cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());

*mem_ptr = new int_mul_memory<Torus>(stream, gpu_index, params,
num_radix_blocks, allocate_gpu_memory);
*mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
allocate_gpu_memory);
}

#endif

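Both scratch functions above drop the FULLSM/NOSM split and unconditionally raise the dynamic shared-memory cap for the single `tree_add_chunks<Torus>` instantiation. Raising `cudaFuncAttributeMaxDynamicSharedMemorySize` is the standard opt-in a kernel needs before a launch may request more than the default 48 KB; a standalone sketch of the pattern (demo names are placeholders, not code from this diff):

#include <cuda_runtime.h>

template <typename Torus>
__global__ void demo_tree_add(Torus *data) {
  extern __shared__ Torus buf[]; // dynamic shared memory, sized at launch
  buf[threadIdx.x] = data[threadIdx.x];
  __syncthreads();
  data[threadIdx.x] = buf[threadIdx.x];
}

// sm_size mirrors (params.big_lwe_dimension + 1) * sizeof(Torus) above and
// must cover at least blockDim.x elements for this demo kernel.
template <typename Torus>
void launch_demo(Torus *data, int sm_size, cudaStream_t stream) {
  cudaFuncSetAttribute(demo_tree_add<Torus>,
                       cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size);
  demo_tree_add<Torus><<<1, 256, sm_size, stream>>>(data);
}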
@@ -1,18 +1,18 @@
#include "integer/negation.cuh"

void cuda_negate_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cuda_stream_t *stream, void *lwe_array, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {

host_integer_radix_negation(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_array),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
host_integer_radix_negation(stream, static_cast<uint64_t *>(lwe_array),
static_cast<uint64_t *>(lwe_array), lwe_dimension,
lwe_ciphertext_count, message_modulus,
carry_modulus);
}

void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -25,24 +25,21 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64(
message_modulus, carry_modulus);

scratch_cuda_integer_overflowing_sub_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
stream, (int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks,
params, allocate_gpu_memory);
}

void cuda_integer_radix_overflowing_sub_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left,
void *radix_lwe_right, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks) {
cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_overflowed,
void *radix_lwe_left, void *radix_lwe_right, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t num_blocks) {

auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;

switch (mem->params.polynomial_size) {
case 512:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -50,8 +47,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 1024:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -59,8 +55,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 2048:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -68,8 +63,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 4096:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -77,8 +71,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 8192:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -86,8 +79,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 16384:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -99,11 +91,10 @@ void cuda_integer_radix_overflowing_sub_kb_64(
}
}

void cleanup_cuda_integer_radix_overflowing_sub(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_overflowing_sub(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_overflowing_sub_memory<uint64_t> *mem_ptr =
(int_overflowing_sub_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

@@ -58,13 +58,12 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
}

template <typename Torus>
__host__ void
host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *output, Torus *input,
uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count,
uint64_t message_modulus, uint64_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
__host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
Torus *input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count,
uint64_t message_modulus,
uint64_t carry_modulus) {
cudaSetDevice(stream->gpu_index);

// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
@@ -82,7 +81,7 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_integer_radix_negation<<<grid, thds, shared_mem, streams[0]>>>(
device_integer_radix_negation<<<grid, thds, shared_mem, stream->stream>>>(
output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
carry_modulus, delta);
check_cuda_error(cudaGetLastError());
@@ -90,33 +89,30 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,

template <typename Torus>
__host__ void scratch_cuda_integer_overflowing_sub_kb(
cudaStream_t stream, uint32_t gpu_index,
int_overflowing_sub_memory<Torus> **mem_ptr, uint32_t num_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cuda_stream_t *stream, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {

cudaSetDevice(gpu_index);
*mem_ptr = new int_overflowing_sub_memory<Torus>(
stream, gpu_index, params, num_blocks, allocate_gpu_memory);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_overflowing_sub_memory<Torus>(stream, params, num_blocks,
allocate_gpu_memory);
}

template <typename Torus, class params>
__host__ void host_integer_overflowing_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left,
Torus *radix_lwe_right, void *bsk, uint64_t *ksk,
cuda_stream_t *stream, Torus *radix_lwe_out, Torus *radix_lwe_overflowed,
Torus *radix_lwe_left, Torus *radix_lwe_right, void *bsk, uint64_t *ksk,
int_overflowing_sub_memory<uint64_t> *mem_ptr, uint32_t num_blocks) {

auto radix_params = mem_ptr->params;

host_unchecked_sub_with_correcting_term(
streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left,
radix_lwe_right, radix_params.big_lwe_dimension, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus,
radix_params.message_modulus - 1);
stream, radix_lwe_out, radix_lwe_left, radix_lwe_right,
radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
radix_params.carry_modulus, radix_params.message_modulus - 1);

host_propagate_single_sub_borrow<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_overflowed, radix_lwe_out,
mem_ptr->borrow_prop_mem, bsk, ksk, num_blocks);
stream, radix_lwe_overflowed, radix_lwe_out, mem_ptr->borrow_prop_mem,
bsk, ksk, num_blocks);
}

#endif

@@ -1,12 +1,12 @@
#include "integer/scalar_addition.cuh"

void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
cuda_stream_t *stream, void *lwe_array, void *scalar_input,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {

host_integer_radix_scalar_addition_inplace(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
stream, static_cast<uint64_t *>(lwe_array),
static_cast<uint64_t *>(scalar_input), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}

@@ -27,11 +27,10 @@ __global__ void device_integer_radix_scalar_addition_inplace(

template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(stream->gpu_index);

// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -45,7 +44,8 @@ __host__ void host_integer_radix_scalar_addition_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_integer_radix_scalar_addition_inplace<<<grid, thds, 0, streams[0]>>>(
device_integer_radix_scalar_addition_inplace<<<grid, thds, 0,
stream->stream>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());
@@ -65,11 +65,10 @@ __global__ void device_integer_radix_add_scalar_one_inplace(

template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t lwe_dimension,
cuda_stream_t *stream, Torus *lwe_array, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);

// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -83,7 +82,8 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0, streams[0]>>>(
device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0,
stream->stream>>>(
lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
@@ -104,11 +104,10 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(

template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(stream->gpu_index);

// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -123,7 +122,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
streams[0]>>>(
stream->stream>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());

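Only the launch syntax changes in these scalar kernels; their job stays the same: fold the `delta`-scaled clear digit into each block. The kernel bodies are not shown in this diff; a sketch consistent with the launch arguments above (an inferred illustration, not the file's actual kernel):

// Sketch (assumption): one thread per radix block; adding a clear value
// only touches the body, so the mask and the noise are left untouched.
template <typename Torus>
__global__ void scalar_addition_inplace_sketch(Torus *lwe_array,
                                               Torus *scalar_input,
                                               uint32_t num_blocks,
                                               uint32_t lwe_dimension,
                                               uint64_t delta) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_blocks) {
    Torus *body = &lwe_array[idx * (lwe_dimension + 1) + lwe_dimension];
    *body += scalar_input[idx] * delta;
  }
}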
@@ -1,14 +1,12 @@
#include "integer/scalar_bitops.cuh"

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_input, void *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_input,
void *clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {

host_integer_radix_scalar_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_input),
static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),

@@ -6,12 +6,12 @@

template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_input, Torus *clear_blocks,
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks, BITOP_TYPE op) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_input,
Torus *clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t num_radix_blocks, BITOP_TYPE op) {

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto lut = mem_ptr->lut;
auto params = lut->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -21,30 +21,28 @@ __host__ void host_integer_radix_scalar_bitop_kb(
if (num_clear_blocks == 0) {
if (op == SCALAR_BITAND) {
cuda_memset_async(lwe_array_out, 0,
num_radix_blocks * lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
num_radix_blocks * lwe_size * sizeof(Torus), stream);
} else {
cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
num_radix_blocks * lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
stream);
}
} else {
// We have all possible LUTs pre-computed and we use the decomposed scalar
// as an index to recover the right one
cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
num_clear_blocks * sizeof(Torus), stream);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsk,
ksk, num_clear_blocks, lut);
stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
lut);

if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
cuda_memset_async(lwe_array_out_block, 0,
(num_radix_blocks - num_clear_blocks) * lwe_size *
sizeof(Torus),
streams[0], gpu_indexes[0]);
stream);
}
}
}

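The bitop path above avoids a per-block branch on the clear operand: one LUT per possible clear-block value is precomputed, and the clear blocks themselves are copied into lut->lut_indexes so each ciphertext block picks its own table. A plaintext model of that selection, using hypothetical std::function tables in place of the real accumulator objects:

#include <cstdint>
#include <functional>
#include <vector>

// Plaintext model (hypothetical helper, not the repository API): build one
// table per clear value, then index into the set with the clear block.
std::vector<std::function<uint64_t(uint64_t)>>
make_scalar_bitand_luts(uint64_t message_modulus) {
  std::vector<std::function<uint64_t(uint64_t)>> luts;
  for (uint64_t clear = 0; clear < message_modulus; ++clear)
    luts.push_back([clear](uint64_t block) { return block & clear; });
  return luts;
}

// Block i is mapped through luts[clear_blocks[i]], which is the effect the
// cuda_memcpy_async_gpu_to_gpu into lut_indexes followed by
// integer_radix_apply_univariate_lookup_table_kb achieves on device.
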
@@ -1,10 +1,9 @@
#include "integer/scalar_comparison.cuh"

void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, void *scalar_blocks,
int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count,
uint32_t num_scalar_blocks) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *scalar_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) {

int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -12,8 +11,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
case EQ:
case NE:
host_integer_radix_scalar_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
@@ -23,8 +21,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
case LT:
case LE:
host_integer_radix_scalar_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer,
buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
@@ -33,8 +30,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
case MAX:
case MIN:
host_integer_radix_scalar_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);

@@ -6,13 +6,12 @@

template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -47,10 +46,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
if (total_num_scalar_blocks == 0) {
// We only have to compare blocks with zero,
// which means the scalar is zero
host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
mem_ptr->tmp_lwe_array_out, lwe_array_in,
mem_ptr, bsk, ksk, total_num_radix_blocks,
mem_ptr->is_zero_lut);
host_compare_with_zero_equality(
stream, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsk, ksk,
total_num_radix_blocks, mem_ptr->is_zero_lut);

auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -59,13 +57,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
};

auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f);
generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, scalar_last_leaf_lut_f);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);

} else if (total_num_scalar_blocks < total_num_radix_blocks) {
// We have to handle both parts of the work described above
@@ -79,7 +76,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;

cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(stream);
auto lsb_stream = mem_ptr->lsb_stream;
auto msb_stream = mem_ptr->msb_stream;

@@ -93,10 +90,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;

pack_blocks(lsb_stream, gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
message_modulus);

// From this point on we have half the number of blocks
num_lsb_radix_blocks /= 2;
@@ -108,15 +105,14 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 2 if lhs > rhs

auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1,
comparisons, lhs, rhs, mem_ptr, bsk, ksk,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
mem_ptr, bsk, ksk, num_lsb_radix_blocks);

// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsk, ksk,
num_lsb_radix_blocks);
}
@@ -124,13 +120,13 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
{
//////////////
// msb
host_compare_with_zero_equality(
&msb_stream, &gpu_indexes[0], 1, lwe_array_msb_out, msb, mem_ptr,
bsk, ksk, num_msb_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
mem_ptr, bsk, ksk, num_msb_radix_blocks,
mem_ptr->is_zero_lut);
}
}
cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
cuda_synchronize_stream(lsb_stream);
cuda_synchronize_stream(msb_stream);

//////////////
// Reduce the two blocks into one final
@@ -145,12 +141,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, scalar_bivariate_last_leaf_lut_f);

integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
lwe_array_msb_out, bsk, ksk, 1, lut);
stream, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsk, ksk,
1, lut);

} else {
// We only have to do the regular comparison
@@ -162,10 +158,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;

pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
pack_blocks(stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(stream, rhs, scalar_blocks, 0, num_scalar_blocks,
message_modulus);

// From this point on we have half the number of blocks
num_lsb_radix_blocks /= 2;
@@ -176,28 +172,26 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
lhs, rhs, mem_ptr, bsk, ksk,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk,
ksk, num_lsb_radix_blocks);

// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
sign_handler_f, bsk, ksk, num_lsb_radix_blocks);
tree_sign_reduction(stream, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, sign_handler_f, bsk,
ksk, num_lsb_radix_blocks);
}
}

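The repeated pack_blocks / num_lsb_radix_blocks /= 2 pairing above relies on two adjacent radix blocks fitting into one ciphertext once the high block is scaled into the carry space. A plaintext sketch of that packing, assuming the low-block-plus-scaled-high-block layout suggested by the message_modulus argument:

#include <cstddef>
#include <cstdint>
#include <vector>

// Plaintext model of pack_blocks (assumed semantics): fuse blocks pairwise
// as low + message_modulus * high, which halves the block count that the
// comparison kernels then have to process.
std::vector<uint64_t> pack_blocks_model(const std::vector<uint64_t> &blocks,
                                        uint64_t message_modulus) {
  std::vector<uint64_t> packed;
  for (std::size_t i = 0; i + 1 < blocks.size(); i += 2)
    packed.push_back(blocks[i] + message_modulus * blocks[i + 1]);
  if (blocks.size() % 2 != 0) // an odd trailing block is kept unpaired
    packed.push_back(blocks.back());
  return packed;
}
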
template <typename Torus>
__host__ void integer_radix_signed_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -233,9 +227,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// We only have to compare blocks with zero,
// which means the scalar is zero
Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
host_compare_with_zero_equality(
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
mem_ptr, bsk, ksk, total_num_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality(stream, are_all_msb_zeros, lwe_array_in,
mem_ptr, bsk, ksk, total_num_radix_blocks,
mem_ptr->is_zero_lut);
Torus *sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;

@@ -276,12 +270,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, scalar_bivariate_last_leaf_lut_f);

integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
sign_block, bsk, ksk, 1, lut);
stream, lwe_array_out, are_all_msb_zeros, sign_block, bsk, ksk, 1, lut);

} else if (total_num_scalar_blocks < total_num_radix_blocks) {
// We have to handle both parts of the work described above
@@ -295,7 +288,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;

cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(stream);
auto lsb_stream = mem_ptr->lsb_stream;
auto msb_stream = mem_ptr->msb_stream;

@@ -309,10 +302,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;

pack_blocks(lsb_stream, gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
message_modulus);

// From this point on we have half the number of blocks
num_lsb_radix_blocks /= 2;
@@ -324,15 +317,14 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 2 if lhs > rhs

auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1,
comparisons, lhs, rhs, mem_ptr, bsk, ksk,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
mem_ptr, bsk, ksk, num_lsb_radix_blocks);

// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsk, ksk,
num_lsb_radix_blocks);
}
@@ -342,9 +334,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// msb
// We remove the last block (which is the sign)
Torus *are_all_msb_zeros = lwe_array_msb_out;
host_compare_with_zero_equality(
&msb_stream, &gpu_indexes[0], 1, are_all_msb_zeros, msb, mem_ptr,
bsk, ksk, num_msb_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality(msb_stream, are_all_msb_zeros, msb,
mem_ptr, bsk, ksk, num_msb_radix_blocks,
mem_ptr->is_zero_lut);

auto sign_bit_pos = (int)log2(message_modulus) - 1;

@@ -372,23 +364,23 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

auto signed_msb_lut = mem_ptr->signed_msb_lut;
generate_device_accumulator_bivariate<Torus>(
msb_stream, gpu_indexes[0], signed_msb_lut->lut,
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f);
msb_stream, signed_msb_lut->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f);

Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
integer_radix_apply_bivariate_lookup_table_kb(
&msb_stream, &gpu_indexes[0], 1, lwe_array_msb_out, sign_block,
are_all_msb_zeros, bsk, ksk, 1, signed_msb_lut);
msb_stream, lwe_array_msb_out, sign_block, are_all_msb_zeros, bsk,
ksk, 1, signed_msb_lut);
}
}
cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
cuda_synchronize_stream(lsb_stream);
cuda_synchronize_stream(msb_stream);

//////////////
// Reduce the two blocks into one final
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsk, ksk, 2);
reduce_signs(stream, lwe_array_out, lwe_array_lsb_out, mem_ptr,
sign_handler_f, bsk, ksk, 2);

} else {
// We only have to do the regular comparison
@@ -396,7 +388,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// total_num_radix_blocks == total_num_scalar_blocks
uint32_t num_lsb_radix_blocks = total_num_radix_blocks;

cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(stream);
auto lsb_stream = mem_ptr->lsb_stream;
auto msb_stream = mem_ptr->msb_stream;

@@ -411,11 +403,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;

pack_blocks(lsb_stream, gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks - 1,
message_modulus);
pack_blocks(lsb_stream, gpu_indexes[0], rhs, scalar_blocks, 0,
pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks - 1, message_modulus);
pack_blocks(lsb_stream, rhs, scalar_blocks, 0, num_lsb_radix_blocks - 1,
message_modulus);

// From this point on we have half the number of blocks
num_lsb_radix_blocks /= 2;
@@ -424,9 +415,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1,
lwe_array_ct_out, lhs, rhs, mem_ptr, bsk,
ksk, num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb(lsb_stream, lwe_array_ct_out, lhs, rhs,
mem_ptr, bsk, ksk, num_lsb_radix_blocks);
}
#pragma omp section
{
@@ -436,36 +426,34 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
scalar_blocks + (total_num_scalar_blocks - 1);

auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
create_trivial_radix(msb_stream, gpu_indexes[0], trivial_sign_block,
scalar_sign_block, big_lwe_dimension, 1, 1,
message_modulus, carry_modulus);
create_trivial_radix(msb_stream, trivial_sign_block, scalar_sign_block,
big_lwe_dimension, 1, 1, message_modulus,
carry_modulus);

integer_radix_apply_bivariate_lookup_table_kb(
&msb_stream, &gpu_indexes[0], 1, lwe_array_sign_out,
encrypted_sign_block, trivial_sign_block, bsk, ksk, 1,
mem_ptr->signed_lut);
msb_stream, lwe_array_sign_out, encrypted_sign_block,
trivial_sign_block, bsk, ksk, 1, mem_ptr->signed_lut);
}
}
cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
cuda_synchronize_stream(lsb_stream);
cuda_synchronize_stream(msb_stream);

// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_ct_out, mem_ptr, sign_handler_f, bsk, ksk,
num_lsb_radix_blocks + 1);
reduce_signs(stream, lwe_array_out, lwe_array_ct_out, mem_ptr,
sign_handler_f, bsk, ksk, num_lsb_radix_blocks + 1);
}
}

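For the signed variant, the MSB branch above folds the encrypted sign block and the scalar's sign into a dedicated bivariate LUT. A plaintext model of the ordering rule that LUT has to encode, using the 0/1/2 convention from the comments; the exact LUT contents are an assumption:

#include <cstdint>

// Plaintext model of the signed comparison fix-up: when the two signs
// differ, the negative operand is smaller no matter what the magnitude
// comparison said; otherwise the magnitude result stands.
// Returns 0 (lhs < rhs), 1 (equal) or 2 (lhs > rhs).
int signed_sign_model(int64_t lhs, int64_t rhs) {
  bool lhs_neg = lhs < 0;
  bool rhs_neg = rhs < 0;
  if (lhs_neg != rhs_neg)
    return lhs_neg ? 0 : 2;
  return lhs == rhs ? 1 : (lhs < rhs ? 0 : 2);
}
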
template <typename Torus>
__host__ void integer_radix_signed_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
// Calculates the difference sign between the ciphertext and the scalar
// - 0 if lhs < rhs
@@ -473,8 +461,8 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
integer_radix_signed_scalar_difference_check_kb(
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);

// There is no optimized CMUX for scalars, so we convert to a trivial
@@ -482,70 +470,66 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
auto lwe_array_left = lwe_array_in;
auto lwe_array_right = mem_ptr->tmp_block_comparisons;

create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
create_trivial_radix(stream, lwe_array_right, scalar_blocks,
params.big_lwe_dimension, total_num_radix_blocks,
total_num_scalar_blocks, params.message_modulus,
params.carry_modulus);

// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, sign, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
host_integer_radix_cmux_kb(stream, lwe_array_out, sign, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
total_num_radix_blocks);
}

template <typename Torus>
__host__ void host_integer_radix_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_difference_check_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, sign_handler_f, bsk, ksk,
total_num_radix_blocks, total_num_scalar_blocks);
stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr,
sign_handler_f, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_difference_check_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, sign_handler_f, bsk, ksk,
total_num_radix_blocks, total_num_scalar_blocks);
stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr,
sign_handler_f, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
}
}

template <typename Torus>
__host__ void host_integer_radix_signed_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {

if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_maxmin_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsk, ksk,
total_num_radix_blocks, total_num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_maxmin_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsk, ksk,
total_num_radix_blocks, total_num_scalar_blocks);
}
}

template <typename Torus>
__host__ void
scalar_compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -565,38 +549,37 @@ scalar_compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000

auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
subtracted_blocks, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace(
streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
stream, subtracted_blocks, scalar_blocks, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);

// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsk,
ksk, num_radix_blocks, sign_lut);
integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
subtracted_blocks, bsk, ksk,
num_radix_blocks, sign_lut);

// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
big_lwe_dimension, num_radix_blocks,
message_modulus, carry_modulus);
}

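scalar_compare_radix_blocks_kb builds its per-block sign in three steps: a true LWE subtraction, a sign LUT, and a scalar add of one. A plaintext model of the arithmetic, following the comments above; the "top half of the space means negative" rule is an assumption about the encoding:

#include <cstdint>

// Plaintext model of the per-block sign computation. After the
// subtraction the block holds (lhs - rhs) mod space; the LUT maps that to
// (-1) mod space, 0 or 1, and adding one yields the 0/1/2 convention.
uint64_t block_sign_model(uint64_t lhs, uint64_t rhs, uint64_t msg_mod,
                          uint64_t carry_mod) {
  uint64_t space = msg_mod * carry_mod;
  uint64_t diff = (lhs + space - rhs) % space; // true sub, wraps like LWE
  uint64_t sign;
  if (diff == 0)
    sign = 0;                // equal
  else if (diff < space / 2)
    sign = 1;                // positive difference: lhs > rhs
  else
    sign = space - 1;        // negative difference: (-1) mod space
  return (sign + 1) % space; // 0 if lhs < rhs, 1 if equal, 2 if lhs > rhs
}
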
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;

// Calculates the difference sign between the ciphertext and the scalar
@@ -605,8 +588,8 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
host_integer_radix_scalar_difference_check_kb(
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);

// There is no optimized CMUX for scalars, so we convert to a trivial
@@ -614,27 +597,24 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
auto lwe_array_left = lwe_array_in;
auto lwe_array_right = mem_ptr->tmp_block_comparisons;

create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
create_trivial_radix(stream, lwe_array_right, scalar_blocks,
params.big_lwe_dimension, total_num_radix_blocks,
total_num_scalar_blocks, params.message_modulus,
params.carry_modulus);

// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
total_num_radix_blocks);
host_integer_radix_cmux_kb(
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}

template <typename Torus>
__host__ void host_integer_radix_scalar_equality_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {

cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -661,7 +641,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
auto lwe_array_msb_out =
lwe_array_lsb_out + big_lwe_size * num_halved_lsb_radix_blocks;

cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(stream);

auto lsb_stream = mem_ptr->lsb_stream;
auto msb_stream = mem_ptr->msb_stream;
@@ -676,19 +656,18 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
auto packed_scalar =
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;

pack_blocks(lsb_stream, gpu_indexes[0], packed_blocks, lsb,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, gpu_indexes[0], packed_scalar, scalar_blocks, 0,
pack_blocks(lsb_stream, packed_blocks, lsb, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, packed_scalar, scalar_blocks, 0,
num_scalar_blocks, message_modulus);

cuda_memcpy_async_gpu_to_gpu(scalar_comparison_luts->lut_indexes,
packed_scalar,
num_halved_scalar_blocks * sizeof(Torus),
lsb_stream, gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
scalar_comparison_luts->lut_indexes, packed_scalar,
num_halved_scalar_blocks * sizeof(Torus), lsb_stream);

integer_radix_apply_univariate_lookup_table_kb(
&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out, packed_blocks,
bsk, ksk, num_halved_lsb_radix_blocks, scalar_comparison_luts);
lsb_stream, lwe_array_lsb_out, packed_blocks, bsk, ksk,
num_halved_lsb_radix_blocks, scalar_comparison_luts);
}
}
#pragma omp section
@@ -708,27 +687,25 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
PANIC("Cuda error: integer operation not supported")
}

host_compare_with_zero_equality(&msb_stream, &gpu_indexes[0], 1,
lwe_array_msb_out, msb, mem_ptr, bsk,
ksk, num_msb_radix_blocks, msb_lut);
host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
mem_ptr, bsk, ksk, num_msb_radix_blocks,
msb_lut);
}
}
}

cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
cuda_synchronize_stream(lsb_stream);
cuda_synchronize_stream(msb_stream);

switch (mem_ptr->op) {
case COMPARISON_TYPE::EQ:
are_all_comparisons_block_true(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
mem_ptr, bsk, ksk,
stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
case COMPARISON_TYPE::NE:
is_at_least_one_comparisons_block_true(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
mem_ptr, bsk, ksk,
stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
default:

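The EQ/NE tail above reduces one equality bit per (packed) block into a single block: all blocks must match for EQ, and any mismatch suffices for NE. A plaintext model of the two reductions being dispatched on:

#include <vector>

// Plaintext model of the final reduction: are_all_comparisons_block_true
// corresponds to the conjunction, is_at_least_one_comparisons_block_true
// to the disjunction of the per-block results.
bool reduce_eq_model(const std::vector<bool> &block_equal) {
  for (bool b : block_equal)
    if (!b)
      return false;
  return true;
}

bool reduce_ne_model(const std::vector<bool> &block_not_equal) {
  for (bool b : block_not_equal)
    if (b)
      return true;
  return false;
}
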
@@ -1,7 +1,7 @@
#include "integer/scalar_mul.cuh"

void scratch_cuda_integer_scalar_mul_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -13,22 +13,20 @@ void scratch_cuda_integer_scalar_mul_kb_64(
grouping_factor, message_modulus, carry_modulus);

scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
stream, (int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
}

void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set, int8_t *mem,
void *bsk, void *ksk, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) {
cuda_stream_t *stream, void *lwe_array, uint64_t *decomposed_scalar,
uint64_t *has_at_least_one_set, int8_t *mem, void *bsk, void *ksk,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
uint32_t num_blocks, uint32_t num_scalars) {

switch (polynomial_size) {
case 512:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -36,8 +34,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 1024:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -45,8 +42,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 2048:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -54,8 +50,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 4096:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -63,8 +58,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 8192:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -72,8 +66,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 16384:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -85,12 +78,12 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
}
}

void cleanup_cuda_integer_radix_scalar_mul(void *stream, uint32_t gpu_index,
void cleanup_cuda_integer_radix_scalar_mul(cuda_stream_t *stream,
int8_t **mem_ptr_void) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int_scalar_mul_buffer<uint64_t> *mem_ptr =
(int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

@@ -29,43 +29,33 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,

template <typename T>
__host__ void scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t stream, uint32_t gpu_index, int_scalar_mul_buffer<T> **mem_ptr,
cuda_stream_t *stream, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<T>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<T>, cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());

*mem_ptr = new int_scalar_mul_buffer<T>(
stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory);
*mem_ptr = new int_scalar_mul_buffer<T>(stream, params, num_radix_blocks,
allocate_gpu_memory);
}

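The scratch function above sizes one ciphertext worth of dynamic shared memory and, in the old version, fell back to a no-shared-memory kernel variant when the device could not grant it. A sketch of the capacity query that a helper like cuda_get_max_shared_memory presumably wraps; its exact definition is not shown in this diff:

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical stand-in for cuda_get_max_shared_memory: the opt-in
// per-block limit is the relevant bound when a kernel raises
// cudaFuncAttributeMaxDynamicSharedMemorySize, as done above.
size_t max_dynamic_shared_memory(int device) {
  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, device);
  return prop.sharedMemPerBlockOptin;
}
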
template <typename T, class params>
__host__ void host_integer_scalar_mul_radix(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
T *lwe_array, T *decomposed_scalar, T *has_at_least_one_set,
int_scalar_mul_buffer<T> *mem, void *bsk, T *ksk,
cuda_stream_t *stream, T *lwe_array, T *decomposed_scalar,
T *has_at_least_one_set, int_scalar_mul_buffer<T> *mem, void *bsk, T *ksk,
uint32_t input_lwe_dimension, uint32_t message_modulus,
uint32_t num_radix_blocks, uint32_t num_scalars) {

if (num_radix_blocks == 0 || num_scalars == 0)
return;

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
uint32_t lwe_size = input_lwe_dimension + 1;
@@ -80,15 +70,13 @@ __host__ void host_integer_scalar_mul_radix(
T *ptr = preshifted_buffer + shift_amount * lwe_size * num_radix_blocks;
if (has_at_least_one_set[shift_amount] == 1) {
cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
lwe_size_bytes * num_radix_blocks,
streams[0], gpu_indexes[0]);
lwe_size_bytes * num_radix_blocks, stream);
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, gpu_indexes, gpu_count, ptr, shift_amount,
mem->logical_scalar_shift_buffer, bsk, ksk, num_radix_blocks);
stream, ptr, shift_amount, mem->logical_scalar_shift_buffer, bsk, ksk,
num_radix_blocks);
} else {
// create trivial assign for value = 0
cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, streams[0],
gpu_indexes[0]);
cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, stream);
}
}
size_t j = 0;
@@ -99,40 +87,37 @@ __host__ void host_integer_scalar_mul_radix(
preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
T *block_shift_buffer =
all_shifted_buffer + j * num_radix_blocks * lwe_size;
radix_blocks_rotate_right<<<num_radix_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_radix_blocks, 256, 0, stream->stream>>>(
block_shift_buffer, preshifted_radix_ct, i / msg_bits,
num_radix_blocks, lwe_size);
// create trivial assign for value = 0
cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);
j++;
}
}

if (j == 0) {
// lwe array = 0
cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes,
streams[0], gpu_indexes[0]);
cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes, stream);
} else {
int terms_degree[j * num_radix_blocks];
for (int i = 0; i < j * num_radix_blocks; i++) {
terms_degree[i] = message_modulus - 1;
}
host_integer_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
terms_degree, bsk, ksk, mem->sum_ciphertexts_vec_mem, num_radix_blocks,
j);
stream, lwe_array, all_shifted_buffer, terms_degree, bsk, ksk,
mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);
}
}

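host_integer_scalar_mul_radix is a shift-and-add loop over the bits of the decomposed scalar: bit i contributes the ciphertext shifted by i, realized as an intra-block shift of i % msg_bits plus a whole-block rotation of i / msg_bits, and the shifted copies are summed at the end. A plaintext model of the same decomposition:

#include <cstdint>

// Plaintext model of the shift-and-add scalar multiplication: one shifted
// copy of the ciphertext per set scalar bit, then a single big sum (the
// role host_integer_sum_ciphertexts_vec_kb plays above).
uint64_t scalar_mul_model(uint64_t ct, uint64_t scalar) {
  uint64_t acc = 0;
  for (unsigned i = 0; i < 64; ++i)
    if ((scalar >> i) & 1)
      acc += ct << i;
  return acc;
}
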
// Small scalar_mul is used in shift/rotate
template <typename T>
__host__ void host_integer_small_scalar_mul_radix(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
T *output_lwe_array, T *input_lwe_array, T scalar,
cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -143,7 +128,7 @@ __host__ void host_integer_small_scalar_mul_radix(
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

device_small_scalar_radix_multiplication<<<grid, thds, 0, streams[0]>>>(
device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
input_lwe_ciphertext_count);
check_cuda_error(cudaGetLastError());

@@ -1,7 +1,7 @@
#include "scalar_rotate.cuh"

void scratch_cuda_integer_radix_scalar_rotate_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -15,28 +15,27 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
message_modulus, carry_modulus);

scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
stream, (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
}

void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t n, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,
void *lwe_array, uint32_t n,
int8_t *mem_ptr, void *bsk,
void *ksk,
uint32_t num_blocks) {

host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), n,
stream, static_cast<uint64_t *>(lwe_array), n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}

void cleanup_cuda_integer_radix_scalar_rotate(void *stream, uint32_t gpu_index,
void cleanup_cuda_integer_radix_scalar_rotate(cuda_stream_t *stream,
int8_t **mem_ptr_void) {

cudaSetDevice(gpu_index);
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

@@ -13,25 +13,22 @@

template <typename Torus>
__host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
cudaStream_t stream, uint32_t gpu_index,
int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory) {
cuda_stream_t *stream, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
stream, gpu_index, shift_type, params, num_radix_blocks,
allocate_gpu_memory);
stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t n, int_logical_scalar_shift_buffer<Torus> *mem,
void *bsk, Torus *ksk, uint32_t num_blocks) {

cudaSetDevice(gpu_indexes[0]);
cuda_stream_t *stream, Torus *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
uint32_t num_blocks) {

cudaSetDevice(stream->gpu_index);
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -60,12 +57,11 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
// block_count blocks will be used in the grid
// one block is responsible for processing a single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);

cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);

if (shift_within_block == 0) {
return;
@@ -73,21 +69,20 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsk, ksk, num_blocks, lut_bivariate);
stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
lut_bivariate);

} else {
// left shift
radix_blocks_rotate_left<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);

cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);

if (shift_within_block == 0) {
return;
@@ -95,12 +90,12 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
radix_blocks_rotate_left<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsk, ksk, num_blocks, lut_bivariate);
stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
lut_bivariate);
}
}


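The rotate above splits the shift into whole-block moves (the radix_blocks_rotate_* kernels) and a per-block fix-up of the remaining shift_within_block bits (the bivariate LUT over each receiver/giver pair). A plaintext model of the whole-block half of that split:

#include <cstddef>
#include <cstdint>
#include <vector>

// Plaintext model (rotation direction assumed) of a whole-block rotation:
// block i moves to slot (i + rotations) % n; the intra-block remainder is
// handled separately by the bivariate lookup table in the code above.
std::vector<uint64_t> rotate_blocks_model(const std::vector<uint64_t> &blocks,
                                          std::size_t rotations) {
  std::vector<uint64_t> out(blocks.size());
  for (std::size_t i = 0; i < blocks.size(); ++i)
    out[(i + rotations) % blocks.size()] = blocks[i];
  return out;
}
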
@@ -1,7 +1,7 @@
|
||||
#include "scalar_shifts.cuh"
|
||||
|
||||
void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
@@ -15,9 +15,8 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
shift_type, allocate_gpu_memory);
|
||||
stream, (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
|
||||
params, shift_type, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
/// The logical scalar shift is the one used for unsigned integers, and
|
||||
@@ -25,19 +24,17 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
/// the application of a PBS onto the rotated blocks up to num_blocks -
|
||||
/// rotations - 1 The remaining blocks are padded with zeros
|
||||
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
|
||||
uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk,
|
||||
uint32_t num_blocks) {
|
||||
cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
|
||||
void *bsk, void *ksk, uint32_t num_blocks) {
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(lwe_array), shift,
|
||||
stream, static_cast<uint64_t *>(lwe_array), shift,
|
||||
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
|
||||
static_cast<uint64_t *>(ksk), num_blocks);
|
||||
}

void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -51,9 +48,8 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
message_modulus, carry_modulus);

scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
stream, (int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr,
num_blocks, params, shift_type, allocate_gpu_memory);
}

/// The arithmetic scalar shift is the one used for the signed right shift.
@@ -64,35 +60,31 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// block, which is copied onto all remaining blocks instead of padding with
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks) {
cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
void *bsk, void *ksk, uint32_t num_blocks) {

host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), shift,
stream, static_cast<uint64_t *>(lwe_array), shift,
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}
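A hedged clear-text sketch of the arithmetic right shift described above (illustration only): same block rotation as the logical case, but the vacated most-significant blocks are filled with a sign-extension padding block, the role played by the univariate PBS on ciphertexts.

#include <cstdint>
#include <vector>

// Plaintext analogue of the arithmetic right shift on little-endian blocks.
std::vector<uint32_t> arithmetic_shift_right_blocks(
    std::vector<uint32_t> blocks, uint32_t shift, uint32_t bits_per_block) {
  auto num_blocks = static_cast<uint32_t>(blocks.size());
  uint32_t rotations = shift / bits_per_block;
  uint32_t s = shift % bits_per_block;
  uint32_t mask = (1u << bits_per_block) - 1;
  uint32_t sign = (blocks[num_blocks - 1] >> (bits_per_block - 1)) & 1;
  uint32_t padding = sign ? mask : 0; // the "padding block": all sign bits

  // Rotate towards the LSB; blocks pulled in from the top are padding.
  std::vector<uint32_t> out(num_blocks, padding);
  for (uint32_t i = 0; i + rotations < num_blocks; i++)
    out[i] = blocks[i + rotations];
  if (s == 0 || rotations >= num_blocks)
    return out;
  // Per-block fixup, shifting in bits from the more significant neighbour.
  for (uint32_t i = 0; i + rotations < num_blocks; i++) {
    uint32_t next = (i + 1 < num_blocks) ? out[i + 1] : padding;
    out[i] = ((out[i] >> s) | (next << (bits_per_block - s))) & mask;
  }
  return out;
}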

void cleanup_cuda_integer_radix_logical_scalar_shift(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_logical_scalar_shift(cuda_stream_t *stream,
int8_t **mem_ptr_void) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(cuda_stream_t *stream,
int8_t **mem_ptr_void) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

@@ -1,5 +1,5 @@
#ifndef CUDA_INTEGER_SCALAR_SHIFT_CUH
#define CUDA_INTEGER_SCALAR_SHIFT_CUH
#ifndef CUDA_INTEGER_SCALAR_SHIFT_OPS_CUH
#define CUDA_INTEGER_SCALAR_SHIFT_OPS_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
@@ -14,26 +14,22 @@

template <typename Torus>
__host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
cudaStream_t stream, uint32_t gpu_index,
int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory) {
cuda_stream_t *stream, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
stream, gpu_index, shift_type, params, num_radix_blocks,
allocate_gpu_memory);
stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t shift,
cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
uint32_t num_blocks) {

cudaSetDevice(gpu_indexes[0]);

cudaSetDevice(stream->gpu_index);
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -63,15 +59,14 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
// block_count blocks will be used in the grid
// one block is responsible to process single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);

// create trivial assign for value = 0
cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);

if (shift_within_block == 0 || rotations == num_blocks) {
return;
@@ -84,23 +79,20 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, partial_previous_blocks, bsk, ksk,
partial_block_count, lut_bivariate);
stream, partial_current_blocks, partial_current_blocks,
partial_previous_blocks, bsk, ksk, partial_block_count, lut_bivariate);

} else {
// right shift
radix_blocks_rotate_left<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);

// rotate left as the blocks are from LSB to MSB
// create trivial assign for value = 0
cuda_memset_async(rotated_buffer + (num_blocks - rotations) * big_lwe_size,
0, rotations * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
0, rotations * big_lwe_size_bytes, stream);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);

if (shift_within_block == 0 || rotations == num_blocks) {
return;
@@ -112,34 +104,29 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, partial_next_blocks, bsk, ksk,
partial_block_count, lut_bivariate);
stream, partial_current_blocks, partial_current_blocks,
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
}
}
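For concreteness, a worked decomposition of the quantities above: with 2-bit message blocks (message_modulus = 4) and num_blocks = 8, a logical left shift by 5 splits into rotations = 5 / 2 = 2 whole-block rotations and shift_within_block = 5 % 2 = 1. The memset zeroes the two vacated blocks, and a single batched bivariate PBS over the remaining partial_block_count = num_blocks - rotations = 6 blocks stitches each block to its neighbour.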

template <typename Torus>
__host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
cudaStream_t stream, uint32_t gpu_index,
int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
cuda_stream_t *stream, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
stream, gpu_index, shift_type, params, num_radix_blocks,
allocate_gpu_memory);
stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
}

template <typename Torus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t shift,
cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
uint32_t num_blocks) {

cudaSetDevice(gpu_indexes[0]);

cudaSetDevice(stream->gpu_index);
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -169,11 +156,10 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

if (mem->shift_type == RIGHT_SHIFT) {
radix_blocks_rotate_left<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);

if (num_bits_in_block == 1) {
// if there is only 1 bit in the msg part, it means shift_within block is
@@ -189,7 +175,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
rotated_buffer + (num_blocks - rotations) * big_lwe_size;
for (uint i = 0; i < num_blocks; i++) {
cuda_memcpy_async_gpu_to_gpu(block_dest, block_src, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);
block_dest += big_lwe_size;
}
return;
@@ -199,49 +185,47 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
// bit. This creates the need for a different shifting lut than in the
// logical shift case. We also need another PBS to create the padding block.
Torus *last_block = lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(
last_block_copy,
rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(last_block_copy,
rotated_buffer + (num_blocks - rotations - 1) *
big_lwe_size,
big_lwe_size_bytes, stream);
auto partial_current_blocks = lwe_array;
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
size_t partial_block_count = num_blocks - rotations;
if (shift_within_block != 0 && rotations != num_blocks) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, partial_next_blocks, bsk, ksk,
partial_block_count, lut_bivariate);
stream, partial_current_blocks, partial_current_blocks,
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
}
// Since our CPU threads will be working on different streams we shall
// assert the work in the main stream is completed
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
stream->synchronize();
#pragma omp parallel sections
{
// All sections may be executed in parallel
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
&mem->local_stream_1, &gpu_indexes[0], 1, padding_block,
last_block_copy, bsk, ksk, 1, lut_univariate_padding_block);
mem->local_stream_1, padding_block, last_block_copy, bsk, ksk, 1,
lut_univariate_padding_block);
// Replace blocks 'pulled' from the left with the correct padding block
for (uint i = 0; i < rotations; i++) {
cuda_memcpy_async_gpu_to_gpu(
lwe_array + (num_blocks - rotations + i) * big_lwe_size,
padding_block, big_lwe_size_bytes, mem->local_stream_1,
gpu_indexes[0]);
padding_block, big_lwe_size_bytes, mem->local_stream_1);
}
}
#pragma omp section
{
if (shift_within_block != 0 && rotations != num_blocks) {
integer_radix_apply_univariate_lookup_table_kb(
&mem->local_stream_2, &gpu_indexes[0], 1, last_block,
last_block_copy, bsk, ksk, 1, lut_univariate_shift_last_block);
mem->local_stream_2, last_block, last_block_copy, bsk, ksk, 1,
lut_univariate_shift_last_block);
}
}
}
cuda_synchronize_stream(mem->local_stream_1, gpu_indexes[0]);
cuda_synchronize_stream(mem->local_stream_2, gpu_indexes[0]);
cuda_synchronize_stream(mem->local_stream_1);
cuda_synchronize_stream(mem->local_stream_2);

} else {
PANIC("Cuda error (scalar shift): left scalar shift is never of the "
@@ -249,4 +233,4 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
}
}

#endif // CUDA_SCALAR_SHIFT_CUH
#endif // CUDA_SCALAR_OPS_CUH
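The padding-block branch above overlaps two independent PBS calls: drain the main stream, enqueue each PBS on its own local stream from an OpenMP section, then join. A minimal sketch of that fork-join pattern, assuming the cuda_stream_t handle sketched earlier; work_a/work_b stand for the two lookup-table applications:

// Hedged sketch of the fork-join pattern used by the arithmetic shift.
void fork_join_two_streams(cuda_stream_t *main_stream,
                           cuda_stream_t *local_stream_1,
                           cuda_stream_t *local_stream_2) {
  main_stream->synchronize(); // inputs must be ready before forking
#pragma omp parallel sections
  {
#pragma omp section
    { /* enqueue work_a on local_stream_1 */ }
#pragma omp section
    { /* enqueue work_b on local_stream_2 */ }
  }
  local_stream_1->synchronize(); // join both branches before the caller
  local_stream_2->synchronize(); // resumes enqueuing on the main stream
}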

@@ -1,7 +1,7 @@
#include "shift_and_rotate.cuh"

void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -15,29 +15,26 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
message_modulus, carry_modulus);

scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_shift_and_rotate_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, is_signed, allocate_gpu_memory);
stream, (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, is_signed, allocate_gpu_memory);
}

void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *lwe_shift, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks) {
cuda_stream_t *stream, void *lwe_array, void *lwe_shift, int8_t *mem_ptr,
void *bsk, void *ksk, uint32_t num_blocks) {

host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_shift),
stream, static_cast<uint64_t *>(lwe_array),
static_cast<uint64_t *>(lwe_shift),
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}

void cleanup_cuda_integer_radix_shift_and_rotate(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_shift_and_rotate(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
cudaSetDevice(gpu_index);

int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
(int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

@@ -14,37 +14,33 @@

template <typename Torus>
__host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(
cudaStream_t stream, uint32_t gpu_index,
int_shift_and_rotate_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cuda_stream_t *stream, int_shift_and_rotate_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_shift_and_rotate_buffer<Torus>(
stream, gpu_index, shift_type, is_signed, params, num_radix_blocks,
stream, shift_type, is_signed, params, num_radix_blocks,
allocate_gpu_memory);
}

template <typename Torus>
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *lwe_shift, int_shift_and_rotate_buffer<Torus> *mem,
void *bsk, Torus *ksk, uint32_t num_radix_blocks) {
cuda_stream_t *stream, Torus *lwe_array, Torus *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *bsk, Torus *ksk,
uint32_t num_radix_blocks) {
uint32_t bits_per_block = std::log2(mem->params.message_modulus);
uint32_t total_nb_bits = bits_per_block * num_radix_blocks;
if (total_nb_bits == 0)
return;

auto big_lwe_dimension = mem->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

cudaSetDevice(gpu_indexes[0]);
assert(total_nb_bits > 0);

// Extract all bits
auto bits = mem->tmp_bits;
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsk,
ksk, num_radix_blocks, bits_per_block,
mem->bit_extract_luts);
extract_n_bits<Torus>(stream, bits, lwe_array, bsk, ksk, num_radix_blocks,
bits_per_block, mem->bit_extract_luts);

// Extract shift bits
auto shift_bits = mem->tmp_shift_bits;
@@ -63,8 +59,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Extracts bits and put them in the bit index 2 (=> bit number 3)
// so that it is already aligned to the correct position of the cmux input
// and we reduce noise growth
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
bsk, ksk, 1, max_num_bits_that_tell_shift,
extract_n_bits<Torus>(stream, shift_bits, lwe_shift, bsk, ksk, 1,
max_num_bits_that_tell_shift,
mem->bit_extract_luts_with_offset_2);

// If signed, do an "arithmetic shift" by padding with the sign bit
@@ -78,50 +74,47 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
auto mux_inputs = mem->tmp_mux_inputs;

cuda_memcpy_async_gpu_to_gpu(input_bits_a, bits,
total_nb_bits * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
total_nb_bits * big_lwe_size_bytes, stream);
for (int d = 0; d < max_num_bits_that_tell_shift; d++) {
auto shift_bit = shift_bits + d * big_lwe_size;

cuda_memcpy_async_gpu_to_gpu(input_bits_b, input_bits_a,
total_nb_bits * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
total_nb_bits * big_lwe_size_bytes, stream);

auto rotations = 1 << d;
switch (mem->shift_type) {
case LEFT_SHIFT:
radix_blocks_rotate_right<<<total_nb_bits, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<total_nb_bits, 256, 0, stream->stream>>>(
rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);

if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
for (int i = 0; i < rotations; i++)
cuda_memcpy_async_gpu_to_gpu(rotated_input + i * big_lwe_size,
last_bit, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
last_bit, big_lwe_size_bytes, stream);
else
cuda_memset_async(rotated_input, 0, rotations * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);
break;
case RIGHT_SHIFT:
radix_blocks_rotate_left<<<total_nb_bits, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<total_nb_bits, 256, 0, stream->stream>>>(
rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);

if (mem->is_signed)
for (int i = 0; i < rotations; i++)
cuda_memcpy_async_gpu_to_gpu(
rotated_input + (total_nb_bits - rotations + i) * big_lwe_size,
last_bit, big_lwe_size_bytes, streams[0], gpu_indexes[0]);
last_bit, big_lwe_size_bytes, stream);
else
cuda_memset_async(
rotated_input + (total_nb_bits - rotations) * big_lwe_size, 0,
rotations * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
cuda_memset_async(rotated_input +
(total_nb_bits - rotations) * big_lwe_size,
0, rotations * big_lwe_size_bytes, stream);
break;
case LEFT_ROTATE:
radix_blocks_rotate_right<<<total_nb_bits, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<total_nb_bits, 256, 0, stream->stream>>>(
rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
break;
case RIGHT_ROTATE:
radix_blocks_rotate_left<<<total_nb_bits, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<total_nb_bits, 256, 0, stream->stream>>>(
rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
break;
default:
@@ -131,23 +124,21 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// pack bits into one block so that we have
// control_bit|b|a
cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
streams[0], gpu_indexes[0]); // Do we need this?
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs,
mux_lut->lwe_indexes_out, rotated_input, input_bits_a,
mux_lut->lwe_indexes_in, big_lwe_dimension, 2,
total_nb_bits);
stream); // Do we need this?
pack_bivariate_blocks(stream, mux_inputs, mux_lut->lwe_indexes_out,
rotated_input, input_bits_a, mux_lut->lwe_indexes_in,
big_lwe_dimension, 2, total_nb_bits);

// The shift bit is already properly aligned/positioned
for (int i = 0; i < total_nb_bits; i++)
host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size,
host_addition(stream, mux_inputs + i * big_lwe_size,
mux_inputs + i * big_lwe_size, shift_bit,
mem->params.big_lwe_dimension, 1);

// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsk, ksk,
total_nb_bits, mux_lut);
stream, input_bits_a, mux_inputs, bsk, ksk, total_nb_bits, mux_lut);
}

// Initializes the output
@@ -156,7 +147,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
last_bit = input_bits_a + (bits_per_block - 1) * big_lwe_size;
for (int i = 0; i < num_radix_blocks; i++) {
cuda_memcpy_async_gpu_to_gpu(lwe_last_out, last_bit, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);

lwe_last_out += big_lwe_size;
last_bit += bits_per_block * big_lwe_size;
@@ -167,15 +158,14 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
for (int i = bits_per_block - 2; i >= 0; i--) {

host_integer_small_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, 2,
big_lwe_dimension, num_radix_blocks);
stream, lwe_last_out, lwe_last_out, 2, big_lwe_dimension,
num_radix_blocks);

auto block = lwe_last_out;
auto bit_to_add = input_bits_a + i * big_lwe_size;

for (int j = 0; j < num_radix_blocks; j++) {
host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add,
big_lwe_dimension, 1);
host_addition(stream, block, block, bit_to_add, big_lwe_dimension, 1);

block += big_lwe_size;
bit_to_add += bits_per_block * big_lwe_size;
@@ -184,8 +174,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsk, ksk,
num_radix_blocks, cleaning_lut);
stream, lwe_last_out, lwe_last_out, bsk, ksk, num_radix_blocks,
cleaning_lut);
}
}
#endif
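The routine above is an encrypted barrel shifter: after decomposing the value and the shift amount into bits, stage d conditionally rotates by 2^d, keyed on shift bit d, with the cmux realized as a PBS over packed control_bit|b|a blocks. A hedged clear-text analogue (illustration only):

#include <cstdint>
#include <vector>

// Plaintext barrel rotate: one rotate-and-select stage per shift-amount bit.
std::vector<int> barrel_rotate_left(std::vector<int> bits, uint32_t shift,
                                    uint32_t max_num_bits_that_tell_shift) {
  auto n = static_cast<uint32_t>(bits.size());
  for (uint32_t d = 0; d < max_num_bits_that_tell_shift; d++) {
    uint32_t rotations = 1u << d;
    std::vector<int> rotated(n);
    for (uint32_t i = 0; i < n; i++)
      rotated[(i + rotations) % n] = bits[i];
    int control_bit = (shift >> d) & 1;
    for (uint32_t i = 0; i < n; i++)
      bits[i] = control_bit ? rotated[i] : bits[i]; // the cmux stage
  }
  return bits;
}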

@@ -4,15 +4,14 @@
* Perform the addition of two u32 input LWE ciphertext vectors.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
host_addition(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
@@ -44,15 +43,14 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
* vectors are left unchanged. This function is a wrapper to a device function
* that performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
host_addition(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
@@ -62,12 +60,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
* plaintext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
host_addition_plaintext(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
@@ -101,12 +98,11 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
host_addition_plaintext(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);

@@ -6,10 +6,9 @@
#include <cuda_runtime.h>
#endif

#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "helper.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>

template <typename T>
@@ -28,22 +27,21 @@ __global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
}

template <typename T>
__host__ void
host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
T *lwe_input, T *plaintext_input,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {
__host__ void host_addition_plaintext(cuda_stream_t *stream, T *output,
T *lwe_input, T *plaintext_input,
uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
(lwe_dimension + 1) * lwe_ciphertext_count,
stream, gpu_index);
plaintext_addition<<<grid, thds, 0, stream>>>(
cuda_memcpy_async_gpu_to_gpu(
output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count, stream);
plaintext_addition<<<grid, thds, 0, stream->stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
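Adding a plaintext to an LWE ciphertext only needs to touch the body coefficient, the mask is unchanged, which is why plaintext_addition launches one thread per ciphertext rather than per coefficient. A hedged sketch of the per-ciphertext update, assuming the standard [a_0, ..., a_{n-1}, b] layout:

// Sketch of the update performed per ciphertext by plaintext_addition:
// an LWE ciphertext (a, b = <a, s> + m + e) becomes an encryption of
// m + plaintext when only its body b is incremented.
template <typename T>
void add_plaintext_to_lwe(T *lwe_ct, T plaintext, uint32_t lwe_dimension) {
  lwe_ct[lwe_dimension] += plaintext; // the body is the last coefficient
}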
@@ -62,12 +60,11 @@ __global__ void addition(T *output, T *input_1, T *input_2,

// Coefficient-wise addition
template <typename T>
__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
T *input_1, T *input_2,
uint32_t input_lwe_dimension,
__host__ void host_addition(cuda_stream_t *stream, T *output, T *input_1,
T *input_2, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -78,7 +75,8 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

addition<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
addition<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
num_entries);
check_cuda_error(cudaGetLastError());
}

@@ -96,12 +94,11 @@ __global__ void subtraction(T *output, T *input_1, T *input_2,

// Coefficient-wise subtraction
template <typename T>
__host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
T *output, T *input_1, T *input_2,
uint32_t input_lwe_dimension,
__host__ void host_subtraction(cuda_stream_t *stream, T *output, T *input_1,
T *input_2, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -112,7 +109,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

subtraction<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
subtraction<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
num_entries);
check_cuda_error(cudaGetLastError());
}

@@ -132,13 +130,12 @@ __global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
}

template <typename T>
__host__ void host_subtraction_plaintext(cudaStream_t stream,
uint32_t gpu_index, T *output,
__host__ void host_subtraction_plaintext(cuda_stream_t *stream, T *output,
T *lwe_input, T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -148,9 +145,9 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
input_lwe_ciphertext_count *
(input_lwe_dimension + 1) * sizeof(T),
stream, gpu_index);
stream);

radix_body_subtraction_inplace<<<grid, thds, 0, stream>>>(
radix_body_subtraction_inplace<<<grid, thds, 0, stream->stream>>>(
output, plaintext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -178,11 +175,11 @@ __global__ void unchecked_sub_with_correcting_term(
template <typename T>

__host__ void host_unchecked_sub_with_correcting_term(
cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
cuda_stream_t *stream, T *output, T *input_1, T *input_2,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t degree) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -193,7 +190,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

unchecked_sub_with_correcting_term<<<grid, thds, 0, stream>>>(
unchecked_sub_with_correcting_term<<<grid, thds, 0, stream->stream>>>(
output, input_1, input_2, num_entries, lwe_size, message_modulus,
carry_modulus, degree);
check_cuda_error(cudaGetLastError());

@@ -5,12 +5,11 @@
* cleartext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
host_cleartext_multiplication(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in),
input_lwe_dimension,
@@ -45,12 +44,11 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
* function that performs the operation on the GPU.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
host_cleartext_multiplication(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in),
input_lwe_dimension,

@@ -29,12 +29,11 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,

template <typename T>
__host__ void
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension,
host_cleartext_multiplication(cuda_stream_t *stream, T *output, T *lwe_input,
T *cleartext_input, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -45,7 +44,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

cleartext_multiplication<<<grid, thds, 0, stream>>>(
cleartext_multiplication<<<grid, thds, 0, stream->stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}

@@ -4,14 +4,13 @@
* Perform the negation of a u32 input LWE ciphertext vector.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
host_negation(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
@@ -38,14 +37,13 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
* LWE ciphertext vector is left unchanged. This function is a wrapper to a
* device function that performs the operation on the GPU.
*/
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
host_negation(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

@@ -22,11 +22,11 @@ __global__ void negation(T *output, T *input, uint32_t num_entries) {
}

template <typename T>
__host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
T *input, uint32_t input_lwe_dimension,
__host__ void host_negation(cuda_stream_t *stream, T *output, T *input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

negation<<<grid, thds, 0, stream>>>(output, input, num_entries);
negation<<<grid, thds, 0, stream->stream>>>(output, input, num_entries);
check_cuda_error(cudaGetLastError());
}

@@ -1,38 +1,36 @@
#include "bootstrapping_key.cuh"

void cuda_convert_lwe_programmable_bootstrap_key_32(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key<uint32_t, int32_t>(
static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
(int32_t *)src, polynomial_size, total_polynomials);
(double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}

void cuda_convert_lwe_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key<uint64_t, int64_t>(
static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
(int64_t *)src, polynomial_size, total_polynomials);
(double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor) {
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t grouping_factor) {
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
level_count * (1 << grouping_factor) /
grouping_factor;
size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);

cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
static_cast<cudaStream_t>(stream), gpu_index);
stream);
}

// We need these lines so the compiler knows how to specialize these functions

@@ -60,13 +60,12 @@ __device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
}
////////////////////////////////////////////////
template <typename T, typename ST>
void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
uint32_t gpu_index,
double2 *dest, ST *src,
uint32_t polynomial_size,
uint32_t total_polynomials) {
void cuda_convert_lwe_programmable_bootstrap_key(
double2 *dest, ST *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t total_polynomials) {

cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;

// Here the buffer size is the size of double2 times the number of polynomials
@@ -80,7 +79,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,

double2 *h_bsk = (double2 *)malloc(buffer_size);

double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream);

// compress real bsk to complex and divide it on DOUBLE_MAX
for (int i = 0; i < total_polynomials; i++) {
@@ -97,12 +96,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
}

cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream);

double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
double2 *buffer = (double2 *)cuda_malloc_async(0, stream);
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -110,17 +109,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -128,17 +127,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -146,17 +145,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -164,17 +163,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -182,17 +181,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -200,17 +199,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -218,13 +217,13 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
default:
@@ -232,17 +231,16 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
"N's are powers of two in the interval [256..16384].")
}

cuda_drop_async(d_bsk, stream, gpu_index);
cuda_drop_async(buffer, stream, gpu_index);
cuda_drop_async(d_bsk, stream);
cuda_drop_async(buffer, stream);
free(h_bsk);
}
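The conversion above builds the Fourier bootstrapping key on the host: pack the real torus coefficients into complex samples scaled down as the "divide it on DOUBLE_MAX" comment describes, upload, then run one batched forward negacyclic FFT per polynomial. A hedged sketch of the packing step; the exact folding and the scaling constant are assumptions for illustration, not the backend's definition:

#include <cstdint>

struct complex2 { double x; double y; };

// Assumed packing: the two halves of a degree-N polynomial fill the real and
// imaginary parts of N/2 complex samples, each coefficient mapped to a double
// in [-0.5, 0.5) by dividing by 2^64 (stand-in for the DOUBLE_MAX scaling).
void pack_polynomial(complex2 *dst, const int64_t *src, uint32_t N) {
  const double scale = 18446744073709551616.0; // 2^64, assumed
  for (uint32_t j = 0; j < N / 2; j++) {
    dst[j].x = static_cast<double>(src[j]) / scale;
    dst[j].y = static_cast<double>(src[j + N / 2]) / scale;
  }
}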

void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
void *_input1, void *_input2, void *_output,
void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
cuda_stream_t *stream,
uint32_t polynomial_size,
uint32_t total_polynomials) {

cudaSetDevice(gpu_index);
auto input1 = (double2 *)_input1;
auto input2 = (double2 *)_input2;
auto output = (double2 *)_output;
@@ -255,8 +253,8 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
@@ -266,18 +264,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
@@ -287,18 +286,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
@@ -308,18 +308,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
@@ -329,18 +330,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
@@ -350,18 +352,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
|
||||
output, buffer);
|
||||
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
|
||||
input1, input2, output, buffer);
|
||||
} else {
|
||||
buffer = (double2 *)cuda_malloc_async(
|
||||
shared_memory_size * total_polynomials, stream, gpu_index);
|
||||
shared_memory_size * total_polynomials, stream);
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
|
||||
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
|
||||
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
|
||||
buffer);
|
||||
}
|
||||
break;
|
||||
case 8192:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
|
||||
FULLSM>,
|
||||
@@ -371,18 +374,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
|
||||
output, buffer);
|
||||
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
|
||||
input1, input2, output, buffer);
|
||||
} else {
|
||||
buffer = (double2 *)cuda_malloc_async(
|
||||
shared_memory_size * total_polynomials, stream, gpu_index);
|
||||
shared_memory_size * total_polynomials, stream);
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
|
||||
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
|
||||
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
|
||||
buffer);
|
||||
}
|
||||
break;
|
||||
case 16384:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
|
||||
FULLSM>,
|
||||
@@ -393,19 +397,20 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
cudaFuncCachePreferShared));
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
|
||||
FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
|
||||
output, buffer);
|
||||
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
|
||||
input1, input2, output, buffer);
|
||||
} else {
|
||||
buffer = (double2 *)cuda_malloc_async(
|
||||
shared_memory_size * total_polynomials, stream, gpu_index);
|
||||
shared_memory_size * total_polynomials, stream);
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
|
||||
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
|
||||
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
|
||||
buffer);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
cuda_drop_async(buffer, stream, gpu_index);
|
||||
cuda_drop_async(buffer, stream);
|
||||
}
|
||||
|
||||
#endif // CNCRT_BSK_H
|
||||
|
||||
@@ -1,44 +1 @@
#include "programmable_bootstrap.cuh"

template <>
__device__ int get_this_block_rank(grid_group &group, bool support_dsm) {
  return blockIdx.y;
}

template <>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, grid_group &group,
                        double2 *global_memory_buffer, uint32_t polynomial_size,
                        uint32_t glwe_dimension, bool support_dsm) {
  double2 *buffer_slice =
      global_memory_buffer +
      (glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
  return buffer_slice;
}

#if CUDA_ARCH >= 900
template <>
__device__ int get_this_block_rank(cluster_group &cluster, bool support_dsm) {
  if (support_dsm)
    return cluster.block_rank();
  else
    return blockIdx.y;
}
template <>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, cluster_group &cluster,
                        double2 *global_memory_buffer, uint32_t polynomial_size,
                        uint32_t glwe_dimension, bool support_dsm) {
  double2 *buffer_slice;
  if (support_dsm) {
    extern __shared__ double2 smem[];
    buffer_slice = cluster.map_shared_rank(
        smem, glwe_id + level_id * (glwe_dimension + 1));
  } else {
    buffer_slice =
        global_memory_buffer +
        (glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
  }
  return buffer_slice;
}
#endif

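When distributed shared memory is unavailable, both specializations above fall back to the same flat layout: the join buffer is a row-major [level][glwe_id] grid of half-size polynomials in global memory. A minimal host-side sketch of that offset arithmetic (the helper below is hypothetical and not part of the patch; only the formula is taken from the code above):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Mirrors the global-memory indexing in get_join_buffer_element above:
    // element (level_id, glwe_id) starts
    // (glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2
    // double2 coefficients into the buffer.
    static size_t join_buffer_offset(int level_id, int glwe_id,
                                     uint32_t polynomial_size,
                                     uint32_t glwe_dimension) {
      return (size_t)(glwe_id + level_id * (glwe_dimension + 1)) *
             polynomial_size / 2;
    }

    int main() {
      // level 2, glwe_id 1, N = 1024, k = 1  ->  (1 + 2 * 2) * 512 = 2560
      printf("%zu\n", join_buffer_offset(2, 1, 1024, 1));
      return 0;
    }
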
@@ -1,124 +1,11 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH

#include "device.h"
#include "fft/bnsmfft.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"

#include "cooperative_groups.h"

using namespace cooperative_groups;
namespace cg = cooperative_groups;

template <typename G>
__device__ int get_this_block_rank(G &group, bool support_dsm);

template <typename G>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, G &group,
                        double2 *global_memory_buffer, uint32_t polynomial_size,
                        uint32_t glwe_dimension, bool support_dsm);

template <typename Torus, typename G, class params>
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
                              double2 *join_buffer, double2 *bootstrapping_key,
                              int polynomial_size, uint32_t glwe_dimension,
                              int level_count, int iteration, G &group,
                              bool support_dsm = false) {

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(fft);
  synchronize_threads_in_block();

  // Get the pieces of the bootstrapping key that will be needed for the
  // external product; blockIdx.x is the ID of the block that's executing
  // this function, so we end up getting the lines of the bootstrapping key
  // needed to perform the external product in this block (corresponding to
  // the same decomposition level)
  auto bsk_slice = get_ith_mask_kth_block(
      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
      glwe_dimension, level_count);

  // Perform the matrix multiplication between the GGSW and the GLWE,
  // each block operating on a single level for mask and body

  // The first product is used to initialize level_join_buffer
  auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
  auto buffer_slice =
      get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
                                 polynomial_size, glwe_dimension, support_dsm);

  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    buffer_slice[tid] = fft[tid] * bsk_poly[tid];
    tid += params::degree / params::opt;
  }

  group.sync();

  // Continues multiplying fft by every polynomial in that particular bsk level
  // Each y-block accumulates in a different polynomial at each iteration
  for (int j = 1; j < (glwe_dimension + 1); j++) {
    int idx = (j + this_block_rank) % (glwe_dimension + 1);

    auto bsk_poly = bsk_slice + idx * params::degree / 2;
    auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
                                                   join_buffer, polynomial_size,
                                                   glwe_dimension, support_dsm);

    int tid = threadIdx.x;
    for (int i = 0; i < params::opt / 2; i++) {
      buffer_slice[tid] += fft[tid] * bsk_poly[tid];
      tid += params::degree / params::opt;
    }
    group.sync();
  }

  // -----------------------------------------------------------------
  // All blocks are synchronized here; after this sync, level_join_buffer has
  // the values needed from every other block

  auto src_acc =
      get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
                                 polynomial_size, glwe_dimension, support_dsm);

  // copy first product into fft buffer
  tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    fft[tid] = src_acc[tid];
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // accumulate rest of the products into fft buffer
  for (int l = 1; l < gridDim.x; l++) {
    auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
                                                  join_buffer, polynomial_size,
                                                  glwe_dimension, support_dsm);
    tid = threadIdx.x;
    for (int i = 0; i < params::opt / 2; i++) {
      fft[tid] += cur_src_acc[tid];
      tid += params::degree / params::opt;
    }
  }

  synchronize_threads_in_block();

  // Perform the inverse FFT on the result of the GGSW x GLWE and add to the
  // accumulator
  NSMFFT_inverse<HalfDegree<params>>(fft);
  synchronize_threads_in_block();

  add_to_torus<Torus, params>(fft, accumulator);

  __syncthreads();
}

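The mechanical change running through this whole comparison is the replacement of the (cudaStream_t, uint32_t gpu_index) argument pairs with a single cuda_stream_t handle, dereferenced as stream->stream at launch sites and stream->gpu_index for device selection. The struct definition itself is outside this excerpt; the following is a plausible minimal reconstruction (only the two fields actually dereferenced in the diff are grounded, the constructor and release method are assumptions):

    #include <cuda_runtime.h>
    #include <cstdint>

    // Hypothetical reconstruction of the handle type used in the new
    // signatures; stream->stream and stream->gpu_index are the only members
    // this diff actually dereferences.
    struct cuda_stream_t {
      cudaStream_t stream;
      uint32_t gpu_index;

      explicit cuda_stream_t(uint32_t gpu_index) : gpu_index(gpu_index) {
        cudaSetDevice(gpu_index);
        cudaStreamCreate(&stream);
      }
      void release() {
        cudaSetDevice(gpu_index);
        cudaStreamDestroy(stream);
      }
    };

    // A launch site then reads:
    //   kernel<<<grid, block, shared_mem, stream->stream>>>(...);
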
#include "../../include/device.h"
|
||||
#include "../../include/programmable_bootstrap.h"
|
||||
#include "../include/device.h"
|
||||
#include "programmable_bootstrap_classic.cuh"
|
||||
#include "programmable_bootstrap_multibit.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *lwe_array_out,
|
||||
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, void *bootstrapping_key,
|
||||
@@ -128,7 +15,6 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
|
||||
auto num_inputs_on_gpu = input_lwe_ciphertext_count / gpu_count;
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
// 32 bits
|
||||
@@ -137,11 +23,11 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
PANIC("Error: 32-bit multibit PBS is not supported.\n")
|
||||
case CLASSICAL:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes,
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -154,19 +40,19 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
if (grouping_factor == 0)
|
||||
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes,
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory);
|
||||
input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case CLASSICAL:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes,
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
@@ -179,15 +65,13 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
if (gpu_index != 0)
|
||||
PANIC("GPU error (pbs): all memory has to reside in GPU 0.")
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
// 32 bits
|
||||
@@ -196,9 +80,8 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
PANIC("Error: 32-bit multibit PBS is not supported.\n")
|
||||
case CLASSICAL:
|
||||
scratch_cuda_programmable_bootstrap_32(
|
||||
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
@@ -211,15 +94,14 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
if (grouping_factor == 0)
|
||||
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
|
||||
scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, level_count, grouping_factor,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, grouping_factor, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case CLASSICAL:
|
||||
scratch_cuda_programmable_bootstrap_64(
|
||||
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
@@ -230,5 +112,3 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
"moduli are supported.")
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
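execute_pbs and execute_scratch_pbs select the 32-bit or 64-bit entry point by switching on sizeof(Torus), which is a compile-time constant and therefore a valid case label. A standalone sketch of the idiom (illustrative only, not code from the patch):

    #include <cstdint>
    #include <cstdio>

    template <typename Torus> void dispatch_by_torus_width() {
      switch (sizeof(Torus)) {
      case sizeof(uint32_t): // 4 bytes: 32-bit path
        printf("32-bit PBS path\n");
        break;
      case sizeof(uint64_t): // 8 bytes: 64-bit path
        printf("64-bit PBS path\n");
        break;
      default:
        break;
      }
    }

    int main() {
      dispatch_by_torus_width<uint64_t>(); // prints "64-bit PBS path"
      return 0;
    }
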
@@ -18,64 +18,57 @@ uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
 * be used.
 */
void scratch_cuda_programmable_bootstrap_amortized_32(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
                                             AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
                                             AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 1024:
    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
                                             AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
                                             AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 4096:
    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
                                             AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 8192:
    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
                                             AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 16384:
    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
                                             AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
          "N's are powers of two in the interval [256..16384].")
          "N's are powers of two"
          " in the interval [256..16384].")
  }
}

@@ -86,60 +79,52 @@ void scratch_cuda_programmable_bootstrap_amortized_32(
 * be used.
 */
void scratch_cuda_programmable_bootstrap_amortized_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
                                             AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
                                             AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 1024:
    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
                                             AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
                                             AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 4096:
    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
                                             AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 8192:
    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
                                             AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 16384:
    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
                                             AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -152,13 +137,12 @@ void scratch_cuda_programmable_bootstrap_amortized_64(
 * ciphertexts. See the corresponding operation on 64 bits for more details.
 */
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {

  if (base_log > 32)
    PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -167,66 +151,66 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 512:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 1024:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 2048:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 4096:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 8192:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 16384:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
        (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
        (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -301,13 +285,12 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
 * values for the FFT
 */
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {

  if (base_log > 64)
    PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -316,66 +299,66 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 512:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 1024:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 2048:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 4096:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 8192:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  case 16384:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory);
        stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
        (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
        (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -388,11 +371,9 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
 * This cleanup function frees the data for the amortized PBS on GPU in
 * buffer for 32 or 64 bits inputs.
 */
void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
                                                   uint32_t gpu_index,
void cleanup_cuda_programmable_bootstrap_amortized(cuda_stream_t *stream,
                                                   int8_t **pbs_buffer) {

  check_cuda_error(cudaSetDevice(gpu_index));
  // Free memory
  cuda_drop_async(*pbs_buffer, static_cast<cudaStream_t>(stream), gpu_index);
  cuda_drop_async(*pbs_buffer, stream);
}

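Taken together, the amortized entry points above form a scratch / run / cleanup lifecycle around one pbs_buffer. A hedged usage sketch against the new single-handle signatures (the wrapper function and every concrete parameter value below are hypothetical; only the three entry-point names and argument orders come from the code above):

    // Illustrative call order only.
    void run_amortized_pbs_64(cuda_stream_t *stream, void *lwe_array_out,
                              void *lwe_output_indexes, void *lut_vector,
                              void *lut_vector_indexes, void *lwe_array_in,
                              void *lwe_input_indexes, void *bootstrapping_key,
                              uint32_t lwe_dimension, uint32_t glwe_dimension,
                              uint32_t polynomial_size, uint32_t base_log,
                              uint32_t level_count, uint32_t num_samples,
                              uint32_t max_shared_memory) {
      int8_t *pbs_buffer = nullptr;
      // 1) allocate scratch space on the stream's GPU
      scratch_cuda_programmable_bootstrap_amortized_64(
          stream, &pbs_buffer, glwe_dimension, polynomial_size,
          /*input_lwe_ciphertext_count=*/num_samples, max_shared_memory,
          /*allocate_gpu_memory=*/true);
      // 2) run the bootstrap over the whole batch
      cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, num_samples,
          /*num_luts=*/1, /*lwe_idx=*/0, max_shared_memory);
      // 3) release the scratch buffer asynchronously on the same stream
      cleanup_cuda_programmable_bootstrap_amortized(stream, &pbs_buffer);
    }
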
@@ -253,11 +253,10 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_amortized(

template <typename Torus, typename STorus, typename params>
__host__ void scratch_programmable_bootstrap_amortized(
    cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
  cudaSetDevice(gpu_index);
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {
  cudaSetDevice(stream->gpu_index);

  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
@@ -285,22 +284,22 @@ __host__ void scratch_programmable_bootstrap_amortized(
        get_buffer_size_programmable_bootstrap_amortized<Torus>(
            glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
            max_shared_memory);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
    check_cuda_error(cudaGetLastError());
  }
}

template <typename Torus, class params>
__host__ void host_programmable_bootstrap_amortized(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory) {

  cudaSetDevice(gpu_index);
  cudaSetDevice(stream->gpu_index);
  uint64_t SM_FULL =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
@@ -328,14 +327,14 @@ __host__ void host_programmable_bootstrap_amortized(
  // of shared memory)
  if (max_shared_memory < SM_PART) {
    device_programmable_bootstrap_amortized<Torus, params, NOSM>
        <<<grid, thds, 0, stream>>>(
        <<<grid, thds, 0, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
            level_count, lwe_idx, DM_FULL);
  } else if (max_shared_memory < SM_FULL) {
    device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>
        <<<grid, thds, SM_PART, stream>>>(
        <<<grid, thds, SM_PART, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
@@ -347,7 +346,7 @@ __host__ void host_programmable_bootstrap_amortized(
    // For lower compute capabilities, this call
    // just does nothing and the amount of shared memory used is 48 KB
    device_programmable_bootstrap_amortized<Torus, params, FULLSM>
        <<<grid, thds, SM_FULL, stream>>>(
        <<<grid, thds, SM_FULL, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,

|
||||
#endif
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
@@ -14,13 +15,104 @@
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "programmable_bootstrap.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
// Cooperative groups are used for this implementation
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename Torus, class params>
|
||||
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
|
||||
double2 *join_buffer, double2 *bootstrapping_key,
|
||||
int polynomial_size, uint32_t glwe_dimension,
|
||||
int level_count, int iteration,
|
||||
grid_group &grid) {
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Get the pieces of the bootstrapping key that will be needed for the
|
||||
// external product; blockIdx.x is the ID of the block that's executing
|
||||
// this function, so we end up getting the lines of the bootstrapping key
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
auto bsk_slice = get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
|
||||
// Selects all GLWEs in a particular decomposition level
|
||||
auto level_join_buffer =
|
||||
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
|
||||
|
||||
// Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
// each block operating on a single level for mask and body
|
||||
|
||||
// The first product is used to initialize level_join_buffer
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
|
||||
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
grid.sync();
|
||||
|
||||
// Continues multiplying fft by every polynomial in that particular bsk level
|
||||
// Each y-block accumulates in a different polynomial at each iteration
|
||||
for (int j = 1; j < (glwe_dimension + 1); j++) {
|
||||
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
|
||||
|
||||
auto bsk_poly = bsk_slice + idx * params::degree / 2;
|
||||
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
grid.sync();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// All blocks are synchronized here; after this sync, level_join_buffer has
|
||||
// the values needed from every other block
|
||||
|
||||
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
|
||||
|
||||
// copy first product into fft buffer
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// accumulate rest of the products into fft buffer
|
||||
for (int l = 1; l < gridDim.x; l++) {
|
||||
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] += cur_src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(fft, accumulator);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
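The grid.sync() calls inside mul_ggsw_glwe synchronize across thread blocks, which is only legal for kernels started through cudaLaunchCooperativeKernel, and that is exactly how device_programmable_bootstrap_cg is launched further down. A minimal cooperative-groups sketch of the pattern (illustrative only, not code from the patch):

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    // Must be launched with cudaLaunchCooperativeKernel; a plain <<<...>>>
    // launch would make grid.sync() undefined behavior.
    __global__ void two_phase_kernel(float *data) {
      cg::grid_group grid = cg::this_grid();
      // Phase 1: each block produces its slice.
      data[blockIdx.x * blockDim.x + threadIdx.x] += 1.0f;
      // Barrier across the whole grid, not just this block.
      grid.sync();
      // Phase 2: slices written by other blocks are now visible.
    }
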
/*
|
||||
* Kernel that computes the classical PBS using cooperative groups
|
||||
*
|
||||
@@ -130,7 +222,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, grid_group, params>(
|
||||
mul_ggsw_glwe<Torus, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
|
||||
polynomial_size, glwe_dimension, level_count, i, grid);
|
||||
|
||||
@@ -154,13 +246,11 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_programmable_bootstrap_cg(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
|
||||
@@ -186,7 +276,7 @@ __host__ void scratch_programmable_bootstrap_cg(
|
||||
}
|
||||
|
||||
*buffer = new pbs_buffer<Torus, CLASSICAL>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
stream, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
@@ -195,14 +285,14 @@ __host__ void scratch_programmable_bootstrap_cg(
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_programmable_bootstrap_cg(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t max_shared_memory) {
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
@@ -242,18 +332,18 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
kernel_args[13] = &full_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, grid,
|
||||
thds, (void **)kernel_args, 0, stream));
|
||||
thds, (void **)kernel_args, 0, stream->stream));
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
kernel_args[13] = &partial_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
|
||||
grid, thds, (void **)kernel_args, partial_sm, stream));
|
||||
grid, thds, (void **)kernel_args, partial_sm, stream->stream));
|
||||
} else {
|
||||
int no_dm = 0;
|
||||
kernel_args[13] = &no_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_programmable_bootstrap_cg<Torus, params, FULLSM>, grid,
|
||||
thds, (void **)kernel_args, full_sm, stream));
|
||||
thds, (void **)kernel_args, full_sm, stream->stream));
|
||||
}
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
@@ -1,5 +1,5 @@
#ifndef CUDA_CG_MULTIBIT_PBS_CUH
#define CUDA_CG_MULTIBIT_PBS_CUH
#ifndef CUDA_FAST_MULTIBIT_PBS_CUH
#define CUDA_FAST_MULTIBIT_PBS_CUH

#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
@@ -11,7 +11,6 @@
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
@@ -107,9 +106,9 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * GGSW -> GLWE
    mul_ggsw_glwe<Torus, grid_group, params>(
        accumulator, accumulator_fft, block_join_buffer, keybundle,
        polynomial_size, glwe_dimension, level_count, i, grid);
    mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
                                 block_join_buffer, keybundle, polynomial_size,
                                 glwe_dimension, level_count, i, grid);

    synchronize_threads_in_block();
  }
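In the hunk above, `mul_ggsw_glwe` loses its explicit `grid_group` template parameter: the grid handle can be recovered inside the callee instead of being threaded through the template list. A self-contained sketch of that pattern (the loop body is a placeholder, not the real external product):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__device__ void accumulate_levels(double2 *accumulator_fft, int level_count) {
  // The grid handle is obtained locally; no template parameter needed.
  cg::grid_group grid = cg::this_grid();
  for (int level = 0; level < level_count; ++level) {
    // ... per-level decomposition / FFT work on accumulator_fft ...
    grid.sync(); // grid-wide barrier; valid only under a cooperative launch
  }
}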
@@ -170,13 +169,13 @@ __host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(

template <typename Torus, typename STorus, typename params>
__host__ void scratch_cg_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    uint32_t grouping_factor, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {

  cudaSetDevice(gpu_index);
  cudaSetDevice(stream->gpu_index);

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -241,26 +240,23 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
  }

  if (!lwe_chunk_size)
    lwe_chunk_size =
        get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
                                          polynomial_size, max_shared_memory);
    lwe_chunk_size = get_lwe_chunk_size(input_lwe_ciphertext_count);
  *buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      stream, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
      allocate_gpu_memory);
}

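The old `get_lwe_chunk_size<Torus, params>(gpu_index, ...)` consulted the device and kernel parameters; the replacement takes only the sample count. The real policy is not visible in this diff, but a heuristic of that shape could look like the following (the thresholds and return values are invented purely for illustration):

// Hypothetical chunk-size policy keyed on batch size alone. The trade-off it
// would encode: bigger chunks amortize keybundle FFTs over fewer launches,
// smaller chunks keep the keybundle working set (and temporaries) small.
__host__ inline uint32_t get_lwe_chunk_size(uint32_t num_samples) {
  if (num_samples >= 1024)
    return 1; // large batches: keep per-sample keybundle memory in check
  if (num_samples >= 128)
    return 16;
  return 64;  // small batches: fewer, larger chunks
}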
template <typename Torus, class params>
__host__ void execute_cg_external_product_loop(
    cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    Torus *lwe_array_out, Torus *lwe_output_indexes,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) {
__host__ void execute_external_product_loop(
    cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lwe_array_out,
    Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_chunk_size, uint32_t max_shared_memory,
    int lwe_offset) {

  cudaSetDevice(gpu_index);
  uint64_t full_dm =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
@@ -310,55 +306,54 @@ __host__ void execute_cg_external_product_loop(
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
            Torus, params, NOSM>,
        grid_accumulate, thds, (void **)kernel_args, 0, stream));
        grid_accumulate, thds, (void **)kernel_args, 0, stream->stream));
  } else if (max_shared_memory < full_dm) {
    kernel_args[19] = &partial_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
            Torus, params, PARTIALSM>,
        grid_accumulate, thds, (void **)kernel_args, partial_dm, stream));
        grid_accumulate, thds, (void **)kernel_args, partial_dm,
        stream->stream));
  } else {
    kernel_args[19] = &no_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
            Torus, params, FULLSM>,
        grid_accumulate, thds, (void **)kernel_args, full_dm, stream));
        grid_accumulate, thds, (void **)kernel_args, full_dm, stream->stream));
  }
}

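All of the cooperative launches in these files follow the same three-way policy: compare the device's `max_shared_memory` against the partial and full shared-memory footprints, pick the NOSM, PARTIALSM, or FULLSM kernel instantiation, and pass the matching dynamic shared-memory byte count. Factored out as a sketch (the enum and kernel stub are stand-ins, not names from the diff):

enum sm_mode { NOSM_MODE, PARTIALSM_MODE, FULLSM_MODE }; // illustrative

template <sm_mode MODE> __global__ void pbs_kernel_stub(/* kernel args */);

void launch_with_best_shared_memory(cudaStream_t stream, dim3 grid, dim3 thds,
                                    void **kernel_args, uint64_t full_sm,
                                    uint64_t partial_sm,
                                    uint32_t max_shared_memory) {
  if (max_shared_memory < partial_sm) {
    // Not even the partial footprint fits: run fully out of global memory.
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)pbs_kernel_stub<NOSM_MODE>, grid, thds, kernel_args, 0,
        stream));
  } else if (max_shared_memory < full_sm) {
    // Only part of the working set can be staged in shared memory.
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)pbs_kernel_stub<PARTIALSM_MODE>, grid, thds, kernel_args,
        partial_sm, stream));
  } else {
    // The whole working set fits in shared memory.
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)pbs_kernel_stub<FULLSM_MODE>, grid, thds, kernel_args, full_sm,
        stream));
  }
}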
template <typename Torus, typename STorus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
    uint32_t lwe_chunk_size = 0) {
  cudaSetDevice(gpu_index);
  cudaSetDevice(stream->gpu_index);

  if (!lwe_chunk_size)
    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
        gpu_index, num_samples, polynomial_size, max_shared_memory);
    lwe_chunk_size = get_lwe_chunk_size(num_samples);

  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {

    // Compute a keybundle
    execute_compute_keybundle<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        stream, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, max_shared_memory,
        lwe_chunk_size, lwe_offset);

    // Accumulate
    execute_cg_external_product_loop<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, lwe_chunk_size,
        max_shared_memory, lwe_offset);
    execute_external_product_loop<Torus, params>(
        stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
        lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        lwe_chunk_size, max_shared_memory, lwe_offset);
  }
}

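`host_cg_multi_bit_programmable_bootstrap` walks the LWE dimension in chunks of `lwe_chunk_size` key groups: each iteration first materializes a keybundle for the chunk, then folds it into the accumulators. Reduced to its control flow (the commented calls stand in for the helpers above):

// Schematic of the chunked traversal, stripped to its structure.
void chunked_multi_bit_pbs(uint32_t lwe_dimension, uint32_t grouping_factor,
                           uint32_t lwe_chunk_size) {
  for (uint32_t lwe_offset = 0;
       lwe_offset < lwe_dimension / grouping_factor;
       lwe_offset += lwe_chunk_size) {
    // Stage 1: compute the keybundle (FFT of the key material) for this chunk.
    // execute_compute_keybundle<Torus, params>(..., lwe_chunk_size, lwe_offset);

    // Stage 2: accumulate the chunk's external products into the GLWE.
    // execute_external_product_loop<Torus, params>(..., lwe_chunk_size, ...,
    //                                              lwe_offset);
  }
}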
@@ -1,8 +1,5 @@
#include "programmable_bootstrap_cg_classic.cuh"
#include "programmable_bootstrap_classic.cuh"
#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_classic.cuh"
#endif

template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
@@ -15,190 +12,6 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
      max_shared_memory);
}

template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t max_shared_memory) {
#if CUDA_ARCH >= 900
  switch (polynomial_size) {
  case 256:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
                                     polynomial_size, level_count,
                                     max_shared_memory);
  case 512:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
                                     polynomial_size, level_count,
                                     max_shared_memory);
  case 1024:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
                                      polynomial_size, level_count,
                                      max_shared_memory);
  case 2048:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
                                      polynomial_size, level_count,
                                      max_shared_memory);
  case 4096:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
                                      polynomial_size, level_count,
                                      max_shared_memory);
  case 8192:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
                                      polynomial_size, level_count,
                                      max_shared_memory);
  case 16384:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
                                       polynomial_size, level_count,
                                       max_shared_memory);
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..16384].")
  }
#else
  return false;
#endif
}

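`has_support_to_cuda_programmable_bootstrap_tbc` gates the TBC path at compile time on `CUDA_ARCH >= 900` and then per polynomial size. A complementary runtime probe for cluster-launch support, should one be needed, could use the device attribute introduced for Hopper (illustrative only; this is not the check performed by `supports_thread_block_clusters_on_classic_programmable_bootstrap`):

bool device_supports_thread_block_clusters(int device) {
  int supported = 0;
  // Non-zero on compute capability 9.0+ devices that can launch clusters.
  cudaDeviceGetAttribute(&supported, cudaDevAttrClusterLaunch, device);
  return supported != 0;
}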
#if (CUDA_ARCH >= 900)
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 1024:
    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 4096:
    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 8192:
    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 16384:
    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
          "Supported N's are powers of two"
          " in the interval [256..16384].")
  }
}

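The `switch (polynomial_size)` blocks repeated through this file all do one thing: lift a runtime size into a compile-time `AmortizedDegree<N>` parameter. That mapping can be captured once with a generic-lambda dispatcher; this is a refactoring sketch, not code from the diff (it assumes `AmortizedDegree<N>` is default-constructible):

template <typename F>
void dispatch_on_polynomial_size(uint32_t polynomial_size, F &&f) {
  switch (polynomial_size) {
  case 256:   f(AmortizedDegree<256>{});   break;
  case 512:   f(AmortizedDegree<512>{});   break;
  case 1024:  f(AmortizedDegree<1024>{});  break;
  case 2048:  f(AmortizedDegree<2048>{});  break;
  case 4096:  f(AmortizedDegree<4096>{});  break;
  case 8192:  f(AmortizedDegree<8192>{});  break;
  case 16384: f(AmortizedDegree<16384>{}); break;
  default:
    PANIC("Cuda error: unsupported polynomial size. Supported N's are powers "
          "of two in the interval [256..16384].")
  }
}

// Usage sketch: the lambda receives the degree as a compile-time type.
// dispatch_on_polynomial_size(polynomial_size, [&](auto degree) {
//   scratch_programmable_bootstrap_tbc<Torus, STorus, decltype(degree)>(...);
// });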
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
    uint32_t lwe_idx, uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
    break;
  case 512:
    host_programmable_bootstrap_tbc<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
    break;
  case 1024:
    host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
    break;
  case 2048:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
    break;
  case 4096:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
    break;
  case 8192:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
    break;
  case 16384:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
          "Supported N's are powers of two"
          " in the interval [256..16384].")
  }
}
#endif

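For context on what the TBC (thread block cluster) variant relies on: clusters are configured through the extended launch API rather than the `<<<...>>>` syntax. A minimal, self-contained sketch with a trivial kernel and an invented cluster shape (the real bootstrap kernels and their cluster dimensions are not shown in this diff):

__global__ void tbc_stub_kernel() {}

void launch_with_cluster(cudaStream_t stream, dim3 grid, dim3 block) {
  cudaLaunchConfig_t config = {};
  config.gridDim = grid;
  config.blockDim = block;
  config.stream = stream;

  cudaLaunchAttribute attribute = {};
  attribute.id = cudaLaunchAttributeClusterDimension;
  attribute.val.clusterDim.x = 2; // illustrative: 2 blocks per cluster
  attribute.val.clusterDim.y = 1;
  attribute.val.clusterDim.z = 1;
  config.attrs = &attribute;
  config.numAttrs = 1;

  check_cuda_error(cudaLaunchKernelEx(&config, tbc_stub_kernel));
}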
/*
 * Returns the buffer size for 64 bits executions
 */
@@ -220,7 +33,7 @@ uint64_t get_buffer_size_programmable_bootstrap_64(

template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_cg(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
@@ -228,44 +41,37 @@ void scratch_cuda_programmable_bootstrap_cg(
  switch (polynomial_size) {
  case 256:
    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 1024:
    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 4096:
    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 8192:
    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 16384:
    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
        stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  default:
@@ -277,7 +83,7 @@ void scratch_cuda_programmable_bootstrap_cg(

template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
    cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
@@ -285,45 +91,38 @@ void scratch_cuda_programmable_bootstrap(
  switch (polynomial_size) {
  case 256:
    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 512:
    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 1024:
    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 2048:
    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 4096:
    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 8192:
    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  case 16384:
    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
        stream, buffer, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -339,33 +138,23 @@ void scratch_cuda_programmable_bootstrap(
 * be used.
 */
void scratch_cuda_programmable_bootstrap_32(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {

#if (CUDA_ARCH >= 900)
  if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
          input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
          level_count, max_shared_memory))
    scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
        stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else
#endif
  if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory))
  if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory))
    scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
        stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
        stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
  else
    scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
        stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
        stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
}

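The wrapper above encodes a fixed preference order: TBC when compiled for compute capability 9.0+ and supported by the parameters, otherwise CG when the cooperative-groups variant fits, otherwise the default two-step pipeline. Made explicit as a sketch (the function name is illustrative; the real code inlines this logic):

PBS_VARIANT choose_pbs_variant(bool tbc_supported, bool cg_supported) {
  if (tbc_supported)
    return PBS_VARIANT::TBC;   // thread block clusters (sm_90+)
  if (cg_supported)
    return PBS_VARIANT::CG;    // cooperative-groups single-kernel variant
  return PBS_VARIANT::DEFAULT; // two-step fallback, always available
}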
/*
@@ -374,40 +163,30 @@ void scratch_cuda_programmable_bootstrap_32(
 * the GPU in case FULLSM or PARTIALSM mode is going to be used.
 */
void scratch_cuda_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {

#if (CUDA_ARCH >= 900)
  if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
          input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
          level_count, max_shared_memory))
    scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else
#endif
  if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory))
  if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory))
    scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
        stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
  else
    scratch_cuda_programmable_bootstrap<uint64_t, int64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
        stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        max_shared_memory, allocate_gpu_memory);
}

template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
@@ -416,59 +195,52 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 512:
    host_programmable_bootstrap_cg<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 1024:
    host_programmable_bootstrap_cg<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 2048:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 4096:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 8192:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 16384:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -479,9 +251,9 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(

template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
@@ -490,59 +262,52 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 512:
    host_programmable_bootstrap<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 1024:
    host_programmable_bootstrap<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 2048:
    host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 4096:
    host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 8192:
    host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  case 16384:
    host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, max_shared_memory);
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_luts, max_shared_memory);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -554,42 +319,22 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
 */
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {

  if (base_log > 32)
    PANIC("Cuda error (classical PBS): base log should be <= number of bits "
          "in the ciphertext representation (32)");

  pbs_buffer<uint64_t, CLASSICAL> *buffer =
      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;

  switch (buffer->pbs_variant) {
  case TBC:
#if CUDA_ARCH >= 900
    cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
        static_cast<uint32_t *>(lwe_output_indexes),
        static_cast<uint32_t *>(lut_vector),
        static_cast<uint32_t *>(lut_vector_indexes),
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key),
        (pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, lwe_idx, max_shared_memory);
#else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
    break;
  case CG:
    if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
            glwe_dimension, polynomial_size, level_count, num_samples,
            max_shared_memory))
      cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
          stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
@@ -599,10 +344,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
          (pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
          glwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_luts, lwe_idx, max_shared_memory);
    break;
  case DEFAULT:
    else
      cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
          stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
          stream, static_cast<uint32_t *>(lwe_array_out),
          static_cast<uint32_t *>(lwe_output_indexes),
          static_cast<uint32_t *>(lut_vector),
          static_cast<uint32_t *>(lut_vector_indexes),
@@ -612,10 +356,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
          (pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
          glwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_luts, lwe_idx, max_shared_memory);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
  }
}

/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
@@ -691,41 +431,21 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
 * values for the FFT
 */
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory) {
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be <= number of bits "
          "in the ciphertext representation (64)");

  pbs_buffer<uint64_t, CLASSICAL> *buffer =
      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;

  switch (buffer->pbs_variant) {
  case PBS_VARIANT::TBC:
#if (CUDA_ARCH >= 900)
    cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_output_indexes),
        static_cast<uint64_t *>(lut_vector),
        static_cast<uint64_t *>(lut_vector_indexes),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key),
        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_luts, lwe_idx, max_shared_memory);
#else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
    break;
  case PBS_VARIANT::CG:
    if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
            glwe_dimension, polynomial_size, level_count, num_samples,
            max_shared_memory))
      cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
          stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
@@ -735,10 +455,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
          (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
          glwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_luts, lwe_idx, max_shared_memory);
    break;
  case PBS_VARIANT::DEFAULT:
    else
      cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
          stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
          stream, static_cast<uint64_t *>(lwe_array_out),
          static_cast<uint64_t *>(lwe_output_indexes),
          static_cast<uint64_t *>(lut_vector),
          static_cast<uint64_t *>(lut_vector_indexes),
@@ -748,21 +467,16 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
          (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
          glwe_dimension, polynomial_size, base_log, level_count, num_samples,
          num_luts, lwe_idx, max_shared_memory);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
  }
}

/*
 * This cleanup function frees the data on GPU for the PBS buffer for 32 or 64
 * bits inputs.
 */
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
void cleanup_cuda_programmable_bootstrap(cuda_stream_t *stream,
                                         int8_t **buffer) {
  cudaSetDevice(gpu_index);
  auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(*buffer);
  x->release(static_cast<cudaStream_t>(stream), gpu_index);
  x->release(stream);
}

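Under the new signature the buffer can free its device arrays on the handle it was built with, with no separate `gpu_index` to thread through. A caller-side sketch of the full teardown (only `release(stream)` appears in the diff; the `delete` reflects that the scratch functions allocate the buffer object with `new`):

void cleanup_example(cuda_stream_t *stream, int8_t **buffer) {
  auto pbs = (pbs_buffer<uint64_t, CLASSICAL> *)(*buffer);
  pbs->release(stream); // assumed to free device allocations on this stream
  delete pbs;           // the buffer object itself came from new in scratch
  *buffer = nullptr;
}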
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
@@ -770,7 +484,7 @@ template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
    uint32_t num_samples, uint32_t max_shared_memory);

template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    cuda_stream_t *stream, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
@@ -780,7 +494,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
    uint32_t lwe_idx, uint32_t max_shared_memory);

template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    cuda_stream_t *stream, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
@@ -790,20 +504,19 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint32_t lwe_idx, uint32_t max_shared_memory);

template void scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    cuda_stream_t *stream, pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

template void scratch_cuda_programmable_bootstrap<uint64_t, int64_t>(
    void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, CLASSICAL> **buffer,
    cuda_stream_t *stream, pbs_buffer<uint64_t, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
    cuda_stream_t *stream, uint32_t *lwe_array_out,
    uint32_t *lwe_output_indexes, uint32_t *lut_vector,
    uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
@@ -813,7 +526,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
    uint32_t lwe_idx, uint32_t max_shared_memory);

template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
    cuda_stream_t *stream, uint32_t *lwe_array_out,
    uint32_t *lwe_output_indexes, uint32_t *lut_vector,
    uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
@@ -823,54 +536,13 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
    uint32_t lwe_idx, uint32_t max_shared_memory);

template void scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

template void scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
    void *stream, uint32_t gpu_index, pbs_buffer<uint32_t, CLASSICAL> **buffer,
    cuda_stream_t *stream, pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t max_shared_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t max_shared_memory);

#if CUDA_ARCH >= 900
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
    uint32_t *lwe_output_indexes, uint32_t *lut_vector,
    uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
    uint32_t lwe_idx, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
    uint32_t lwe_idx, uint32_t max_shared_memory);
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
template void scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
    cuda_stream_t *stream, pbs_buffer<uint32_t, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);
template void scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);
#endif

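The `template void ...;` statements above are explicit instantiations: the template definitions live in this `.cu` translation unit, so every (Torus, STorus) combination the C entry points need must be instantiated here, or the symbols would be missing at link time. The mechanism in miniature:

// Definition visible only inside this translation unit.
template <typename Torus> void bootstrap_impl(Torus *data, uint32_t n) {
  // ... body elided for the sketch ...
}

// Explicit instantiations emit linkable symbols for exactly these types,
// the same pattern used above for uint32_t and uint64_t.
template void bootstrap_impl<uint32_t>(uint32_t *data, uint32_t n);
template void bootstrap_impl<uint64_t>(uint64_t *data, uint32_t n);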
@@ -264,13 +264,11 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(

template <typename Torus, typename STorus, typename params>
__host__ void scratch_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {

  cudaSetDevice(gpu_index);
  cudaSetDevice(stream->gpu_index);

  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
@@ -320,42 +318,41 @@ __host__ void scratch_programmable_bootstrap(
  }

  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      stream, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
}

template <typename Torus, class params>
__host__ void execute_step_one(
    cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    double2 *bootstrapping_key, Torus *global_accumulator,
    double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, int8_t *d_mem,
    uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
    cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
    int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
    uint64_t full_sm, uint64_t full_dm) {

  cudaSetDevice(gpu_index);
  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

  if (max_shared_memory < partial_sm) {
    device_programmable_bootstrap_step_one<Torus, params, NOSM>
        <<<grid, thds, 0, stream>>>(
        <<<grid, thds, 0, stream->stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
        <<<grid, thds, partial_sm, stream->stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm);
  } else {
    device_programmable_bootstrap_step_one<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
        <<<grid, thds, full_sm, stream->stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
@@ -366,36 +363,35 @@ __host__ void execute_step_one(
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void execute_step_two(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
double2 *bootstrapping_key, Torus *global_accumulator,
|
||||
double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, int8_t *d_mem,
|
||||
uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm,
|
||||
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, double2 *bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
|
||||
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
|
||||
uint64_t full_sm, uint64_t full_dm) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_programmable_bootstrap_step_two<Torus, params, NOSM>
|
||||
<<<grid, thds, 0, stream>>>(
|
||||
<<<grid, thds, 0, stream->stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, full_dm);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
<<<grid, thds, partial_sm, stream->stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, partial_dm);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_two<Torus, params, FULLSM>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
<<<grid, thds, full_sm, stream->stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
@@ -408,14 +404,14 @@ __host__ void execute_step_two(
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t max_shared_memory) {
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
@@ -440,14 +436,13 @@ __host__ void host_programmable_bootstrap(
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
execute_step_one<Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, global_accumulator,
|
||||
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one,
|
||||
full_dm_step_one);
|
||||
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
input_lwe_ciphertext_count, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, d_mem, max_shared_memory, i,
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
|
||||
execute_step_two<Torus, params>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, bootstrapping_key, global_accumulator,
|
||||
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
|
||||
@@ -3,10 +3,6 @@
|
||||
#include "programmable_bootstrap_multibit.cuh"
|
||||
#include "programmable_bootstrap_multibit.h"
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
#include "programmable_bootstrap_tbc_multibit.cuh"
|
||||
#endif
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory) {
|
||||
@@ -15,62 +11,11 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
#if CUDA_ARCH >= 900
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 512:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 1024:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 2048:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 4096:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 8192:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 16384:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
@@ -85,65 +30,65 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
case 256:
|
||||
host_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 512:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 1024:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 2048:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 4096:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 8192:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 16384:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -154,9 +99,9 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
@@ -171,64 +116,64 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
case 256:
|
||||
host_multi_bit_programmable_bootstrap<uint64_t, int64_t,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 512:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 1024:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 2048:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 4096:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 8192:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 16384:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -238,22 +183,19 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
}
|
||||
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
|
||||
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::TBC:
|
||||
#if CUDA_ARCH >= 900
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory))
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
@@ -263,43 +205,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
#else
|
||||
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
break;
|
||||
case PBS_VARIANT::CG:
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
else
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
static_cast<uint64_t *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
|
||||
|
||||
@@ -307,50 +231,50 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
case 256:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 512:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
@@ -362,7 +286,7 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
@@ -372,58 +296,51 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
case 256:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 512:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -433,123 +350,65 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
|
||||
void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
cuda_stream_t *stream, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory))
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
else
|
||||
#endif
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
uint64_t>(glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count, grouping_factor,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
else
|
||||
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count, grouping_factor,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
}
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
uint32_t gpu_index,
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
|
||||
int8_t **buffer) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(*buffer);
|
||||
x->release(static_cast<cudaStream_t>(stream), gpu_index);
|
||||
x->release(stream);
|
||||
}
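
// Editor's sketch (not part of the diff): the three entry points above form
// the usual lifecycle -- scratch, bootstrap, cleanup. Since this compare shows
// two signature generations side by side, the sketch follows the
// cuda_stream_t flavour. The function name and all parameter values below are
// hypothetical and purely illustrative; the stream, the device arrays and the
// bootstrapping key are assumed to be allocated and populated elsewhere.
void example_multi_bit_pbs(cuda_stream_t *stream, void *lwe_array_out,
                           void *lwe_output_indexes, void *lut_vector,
                           void *lut_vector_indexes, void *lwe_array_in,
                           void *lwe_input_indexes, void *bootstrapping_key,
                           uint32_t num_samples, uint32_t max_shared_memory) {
  // Illustrative multi-bit parameters; grouping_factor must divide
  // lwe_dimension.
  uint32_t lwe_dimension = 744, glwe_dimension = 1, polynomial_size = 2048;
  uint32_t grouping_factor = 3, base_log = 23, level_count = 1;
  uint32_t lwe_chunk_size = 1; // illustrative; cf. get_lwe_chunk_size below

  // 1) Allocate the scratch buffer; variant selection (TBC/CG/default)
  //    happens inside scratch_cuda_multi_bit_programmable_bootstrap_64.
  int8_t *buffer = nullptr;
  scratch_cuda_multi_bit_programmable_bootstrap_64(
      stream, &buffer, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, grouping_factor, num_samples, max_shared_memory,
      /*allocate_gpu_memory=*/true, lwe_chunk_size);

  // 2) Bootstrap num_samples ciphertexts on the same stream.
  cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
      stream, lwe_array_out, lwe_output_indexes, lut_vector,
      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
      buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
      base_log, level_count, num_samples, /*num_luts=*/1, /*lwe_idx=*/0,
      max_shared_memory, lwe_chunk_size);

  // 3) Release the scratch buffer on the same stream.
  cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
}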

/**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
 * device_multi_bit_programmable_bootstrap_keybundle) smaller than its square
 * root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first
 * suitable divisor. If greater, calculates an offset as max(1,log2(max_num_pbs)
 * - 13) for additional logic.
 *
 * The value 13 was empirically determined based on memory requirements for
 * benchmarking on an RTX 4090 GPU, balancing performance and resource use.
 */
template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                                     uint32_t polynomial_size,
                                     uint32_t max_shared_memory) {
// Returns a chunk size that is not optimal but close to
__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count) {

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);

  int max_blocks_per_sm;
  if (max_shared_memory < full_sm_keybundle)
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm,
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
        polynomial_size / params::opt, full_sm_keybundle);
  else
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm,
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
                                                          FULLSM>,
        polynomial_size / params::opt, 0);

  int num_sms = 0;
  check_cuda_error(cudaDeviceGetAttribute(
      &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));

  int x = num_sms * max_blocks_per_sm;
  int count = 0;

  int divisor = 1;
  int ith_divisor = 0;

#if CUDA_ARCH < 900
  // We pick a smaller divisor on GPUs other than H100, so 256-bit integer
  // multiplication can run
  int log2_max_num_pbs = std::log2(max_num_pbs);
  if (log2_max_num_pbs > 13)
    ith_divisor = log2_max_num_pbs - 11;
#if CUDA_ARCH >= 900
  // Tesla H100
  return (ct_count > 10000) ? 30 : 64;
#elif CUDA_ARCH >= 890
  // Tesla RTX4090
  return 8;
#elif CUDA_ARCH >= 800
  // Tesla A100
  return (ct_count > 10000) ? 30 : 45;
#elif CUDA_ARCH >= 700
  // Tesla V100
  return (ct_count > 10000) ? 12 : 18;
#else
  // Generic case
  return (ct_count > 10000) ? 2 : 1;
#endif

  for (int i = sqrt(x); i >= 1; i--) {
    if (x % i == 0) {
      if (count == ith_divisor) {
        divisor = i;
        break;
      } else {
        count++;
      }
    }
  }

  return divisor;
}
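
// Editor's sketch (not part of the diff): a standalone rendering of the
// divisor walk documented in the comment above, with made-up occupancy
// numbers. In the real code, num_sms comes from cudaDeviceGetAttribute and
// max_blocks_per_sm from cudaOccupancyMaxActiveBlocksPerMultiprocessor;
// pick_divisor is a hypothetical name used only for this illustration.
#include <cmath>
#include <cstdio>

static int pick_divisor(int num_sms, int max_blocks_per_sm, int max_num_pbs) {
  int x = num_sms * max_blocks_per_sm;
  int ith_divisor = 0;
  int log2_max_num_pbs = static_cast<int>(std::log2(max_num_pbs));
  if (log2_max_num_pbs > 13)
    ith_divisor = log2_max_num_pbs - 11; // large batches skip to a smaller divisor
  int count = 0;
  int divisor = 1;
  // Walk the divisors of x that are <= sqrt(x), largest first, and keep the
  // ith one found.
  for (int i = static_cast<int>(std::sqrt(x)); i >= 1; i--) {
    if (x % i == 0) {
      if (count == ith_divisor) {
        divisor = i;
        break;
      }
      count++;
    }
  }
  return divisor;
}

int main() {
  // 108 SMs x 2 resident blocks -> x = 216; divisors of 216 up to
  // sqrt(216) ~ 14, descending: 12, 9, 8, 6, ...; log2(4096) = 12 <= 13, so
  // ith_divisor stays 0 and the chosen chunk size is 12.
  printf("%d\n", pick_divisor(108, 2, 4096));
  return 0;
}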

template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t max_shared_memory, bool allocate_gpu_memory,
    uint32_t lwe_chunk_size);
    cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size);

template void
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    cuda_stream_t *stream, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
@@ -561,15 +420,15 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(

template void
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size);

template void
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    cuda_stream_t *stream, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
@@ -578,187 +437,3 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
    uint32_t lwe_chunk_size);

template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t max_shared_memory);

#if (CUDA_ARCH >= 900)
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {

  switch (polynomial_size) {
  case 256:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
                                                 AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
        allocate_gpu_memory, lwe_chunk_size);
    break;
  case 512:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
                                                 AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
        allocate_gpu_memory, lwe_chunk_size);
    break;
  case 1024:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
                                                 AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
        allocate_gpu_memory, lwe_chunk_size);
    break;
  case 2048:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
                                                 AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
        allocate_gpu_memory, lwe_chunk_size);
    break;
  case 4096:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
                                                 AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
        allocate_gpu_memory, lwe_chunk_size);
    break;
  case 8192:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
                                                 AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
        allocate_gpu_memory, lwe_chunk_size);
    break;
  case 16384:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
                                                 AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
        allocate_gpu_memory, lwe_chunk_size);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..16384].")
  }
}
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
    uint32_t lwe_chunk_size) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be <= number of bits in "
          "the ciphertext representation (64)");

  switch (polynomial_size) {
  case 256:
    host_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t,
                                              AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
    break;
  case 512:
    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
                                              AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
    break;
  case 1024:
    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
                                              AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
    break;
  case 2048:
    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
                                              AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
    break;
  case 4096:
    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
                                              AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
    break;
  case 8192:
    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
                                              AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
    break;
  case 16384:
    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
                                              AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..16384].")
  }
}

template void
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
    void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size);

template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
    uint32_t lwe_chunk_size);
#endif