Compare commits

..

2 Commits

Author SHA1 Message Date
David Testé c721b0fdaf chore(ci): WIP test hyperstack on pre_prod slab 2024-04-10 18:05:37 +02:00
David Testé 470667507d chore(ci): update usage of slab-github-runner to last version 2024-04-10 10:28:48 +02:00
327 changed files with 11220 additions and 21338 deletions

View File

@@ -18,8 +18,8 @@ on:
pull_request:
jobs:
setup-instance:
name: Setup instance (fast-tests)
setup-ec2:
name: Setup EC2 instance (fast-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -37,21 +37,21 @@ jobs:
fast-tests:
name: Fast CPU tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable
@@ -115,10 +115,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (fast-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, fast-tests ]
teardown-ec2:
name: Teardown EC2 instance (fast-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, fast-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -129,7 +129,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -137,4 +137,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -29,10 +29,10 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable

View File

@@ -18,8 +18,8 @@ on:
pull_request:
jobs:
setup-instance:
name: Setup instance (cuda-tests)
setup-ec2:
name: Setup EC2 instance (cuda-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -30,18 +30,18 @@ jobs:
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: gpu-test
cuda-pcc:
name: CUDA post-commit checks
needs: setup-instance
cuda-tests-linux:
name: CUDA tests
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -55,14 +55,14 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
@@ -93,64 +93,6 @@ jobs:
run: |
make pcc_gpu
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
cuda-tests-linux:
name: CUDA tests
needs: [ setup-instance, cuda-pcc ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
@@ -175,10 +117,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-pcc, cuda-tests-linux ]
teardown-ec2:
name: Teardown EC2 instance (cuda-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -187,9 +129,9 @@ jobs:
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -197,4 +139,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -18,8 +18,8 @@ on:
types: [ labeled ]
jobs:
setup-instance:
name: Setup instance (unsigned-integer-tests)
setup-ec2:
name: Setup EC2 instance (unsigned-integer-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
@@ -38,21 +38,21 @@ jobs:
unsigned-integer-tests:
name: Unsigned integer tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable
@@ -80,10 +80,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (unsigned-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, unsigned-integer-tests ]
teardown-ec2:
name: Teardown EC2 instance (unsigned-integer-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, unsigned-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -94,7 +94,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -102,4 +102,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -18,8 +18,8 @@ on:
types: [ labeled ]
jobs:
setup-instance:
name: Setup instance (signed-integer-tests)
setup-ec2:
name: Setup EC2 instance (signed-integer-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
@@ -38,21 +38,21 @@ jobs:
signed-integer-tests:
name: Signed integer tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable
@@ -84,10 +84,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (signed-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, signed-integer-tests ]
teardown-ec2:
name: Teardown EC2 instance (signed-integer-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, signed-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -98,7 +98,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -106,4 +106,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -18,8 +18,8 @@ on:
types: [ labeled ]
jobs:
setup-instance:
name: Setup instance (cpu-tests)
setup-ec2:
name: Setup EC2 instance (cpu-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
@@ -38,21 +38,21 @@ jobs:
cpu-tests:
name: CPU tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable
@@ -110,10 +110,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cpu-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cpu-tests ]
teardown-ec2:
name: Teardown EC2 instance (cpu-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cpu-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -124,7 +124,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -132,4 +132,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -18,8 +18,8 @@ on:
types: [ labeled ]
jobs:
setup-instance:
name: Setup instance (wasm-tests)
setup-ec2:
name: Setup EC2 instance (wasm-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
@@ -38,21 +38,21 @@ jobs:
wasm-tests:
name: WASM tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable
@@ -80,10 +80,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (wasm-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, wasm-tests ]
teardown-ec2:
name: Teardown EC2 instance (wasm-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, wasm-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -94,7 +94,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -102,4 +102,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -97,13 +97,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_boolean
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -23,7 +23,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Install and run newline linter checks
if: matrix.os == 'ubuntu-latest'

View File

@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Get actionlint
run: |

View File

@@ -53,7 +53,7 @@ jobs:
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
@@ -63,13 +63,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
with:
files_yaml: |
tfhe:
@@ -99,7 +99,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@5ecb98a3c6b747ed38dc09f787459979aebb39be
uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -113,7 +113,7 @@ jobs:
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@5ecb98a3c6b747ed38dc09f787459979aebb39be
uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -88,13 +88,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -1,24 +1,18 @@
# Run core crypto benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
# Run core crypto benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Core crypto GPU benchmarks
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-core-crypto-benchmarks)
setup-ec2:
name: Setup EC2 instance (cuda-benchmarks)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -29,15 +23,15 @@ jobs:
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
profile: gpu-bench
cuda-core-crypto-benchmarks:
name: Execute GPU core crypto benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
core-crypto-benchmarks:
name: CUDA core crypto benchmarks
needs: setup-ec2
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -49,21 +43,12 @@ jobs:
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -71,26 +56,22 @@ jobs:
make -j"$(nproc)"
sudo make install
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -122,26 +103,28 @@ jobs:
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x1" \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--name-suffix avx512 \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -161,18 +144,23 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
# FIXME This action needs docker to be installed on the machine beforehand.
# - name: Slack Notification
# if: ${{ failure() }}
# continue-on-error: true
# uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
# env:
# SLACK_COLOR: ${{ job.status }}
# SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
# SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
# SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
# SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
# SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
teardown-instance:
name: Teardown instance (cuda-integer-full-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-core-crypto-benchmarks ]
teardown-ec2:
name: Teardown EC2 instance (cuda-benchmarks)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, core-crypto-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -181,9 +169,9 @@ jobs:
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -191,4 +179,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (cuda-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -19,8 +19,8 @@ on:
jobs:
setup-instance:
name: Setup instance (csprng-randomness-tests)
setup-ec2:
name: Setup EC2 instance (csprng-randomness-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
@@ -39,21 +39,21 @@ jobs:
csprng-randomness-tests:
name: CSPRNG randomness tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable
@@ -69,10 +69,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (csprng-randomness-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, csprng-randomness-tests ]
teardown-ec2:
name: Teardown EC2 instance (csprng-randomness-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, csprng-randomness-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -83,7 +83,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -91,4 +91,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -39,7 +39,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -52,12 +52,12 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -81,7 +81,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -120,7 +120,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -133,12 +133,12 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -163,7 +163,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
@@ -194,7 +194,7 @@ jobs:
name: Remove 4090 bench label
if: ${{ always() && github.event_name == 'pull_request' }}
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
runs-on: ubuntu-latest
runs-on: ["self-hosted", "4090-desktop"]
steps:
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
with:

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -74,7 +74,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -92,12 +92,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -121,7 +121,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -1,11 +1,10 @@
# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Integer GPU benchmarks
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
push:
branches:
- main
pull_request:
env:
CARGO_TERM_COLOR: always
@@ -14,14 +13,10 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-integer-benchmarks)
setup-ec2:
name: Setup EC2 instance (cuda-benchmarks)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -32,15 +27,15 @@ jobs:
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
profile: gpu-bench
cuda-integer-benchmarks:
name: Execute GPU integer benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
name: CUDA integer benchmarks
needs: setup-ec2
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -52,21 +47,12 @@ jobs:
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -74,26 +60,22 @@ jobs:
make -j"$(nproc)"
sudo make install
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -127,33 +109,35 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x1" \
--hardware "n2-H100x1" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -173,23 +157,23 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-benchmarks]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
if: ${{ !success() && !cancelled() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
# FIXME This action needs docker to be installed on the machine beforehand.
# - name: Slack Notification
# if: ${{ !success() && !cancelled() }}
# continue-on-error: true
# uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
# env:
# SLACK_COLOR: ${{ job.status }}
# SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
# SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
# SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
# SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
# SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
teardown-instance:
name: Teardown instance (cuda-integer-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-integer-benchmarks ]
teardown-ec2:
name: Teardown EC2 instance (cuda-benchmarks)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cuda-integer-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -198,9 +182,9 @@ jobs:
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -208,4 +192,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (cuda-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,11 +1,10 @@
# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Integer GPU full benchmarks
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
pull_request:
env:
CARGO_TERM_COLOR: always
@@ -13,14 +12,10 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-integer-full-benchmarks)
setup-ec2:
name: Setup EC2 instance (cuda-full-benchmarks)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -31,15 +26,15 @@ jobs:
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
profile: gpu-bench
cuda-integer-full-benchmarks:
name: Execute GPU integer benchmarks for all operations flavor
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
name: CUDA integer full benchmarks
needs: setup-ec2
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
continue-on-error: true
strategy:
@@ -56,21 +51,12 @@ jobs:
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -79,7 +65,7 @@ jobs:
sudo make install
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -97,7 +83,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -122,7 +108,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -136,7 +122,7 @@ jobs:
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x1" \
--hardware "n2-H100x1" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
@@ -147,7 +133,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -166,18 +152,23 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
# FIXME This action needs docker to be installed on the machine beforehand.
# - name: Slack Notification
# if: ${{ !success() && !cancelled() }}
# continue-on-error: true
# uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
# env:
# SLACK_COLOR: ${{ job.status }}
# SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
# SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
# SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
# SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
# SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
teardown-instance:
name: Teardown instance (cuda-integer-full-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-integer-full-benchmarks ]
teardown-ec2:
name: Teardown EC2 instance (cuda-full-benchmarks)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cuda-integer-full-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -186,9 +177,9 @@ jobs:
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -196,4 +187,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-integer-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (cuda-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -1,11 +1,10 @@
# Run integer benchmarks with multi-bit cryptographic parameters on an instance and return parsed results to Slab CI bot.
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
name: Integer GPU Multi-bit benchmarks
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
pull_request:
env:
CARGO_TERM_COLOR: always
@@ -14,14 +13,10 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-integer-multi-bit-benchmarks)
setup-ec2:
name: Setup EC2 instance (cuda-multi-bit-benchmarks)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -32,15 +27,15 @@ jobs:
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
profile: gpu-bench
cuda-integer-multi-bit-benchmarks:
name: Execute GPU integer multi-bit benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
name: CUDA integer multi-bit benchmarks
needs: setup-ec2
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
strategy:
fail-fast: false
@@ -53,21 +48,12 @@ jobs:
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -75,26 +61,22 @@ jobs:
make -j"$(nproc)"
sudo make install
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -128,33 +110,35 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x1" \
--hardware "n2-H100x1" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -174,18 +158,23 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Integer GPU multi-bit benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
# FIXME This action needs docker to be installed on the machine beforehand.
# - name: Slack Notification
# if: ${{ !success() && !cancelled() }}
# continue-on-error: true
# uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
# env:
# SLACK_COLOR: ${{ job.status }}
# SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
# SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
# SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
# SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
# SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
teardown-instance:
name: Teardown instance (cuda-integer-full-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
teardown-ec2:
name: Teardown EC2 instance (cuda-multi-bit-benchmarks)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cuda-integer-multi-bit-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -194,9 +183,9 @@ jobs:
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -204,4 +193,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (cuda-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -31,10 +31,10 @@ jobs:
timeout-minutes: 720
steps:
- uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable
@@ -86,13 +86,6 @@ jobs:
run: |
make test_boolean
# Because we do "illegal" things with the build system which Cargo does not seem to like much
# we need to clear the cache to make sure the C API is built properly and does not use a stale
# cached version
- name: Clear build cache
run: |
cargo clean
- name: Run C API tests
run: |
make test_c_api

View File

@@ -30,7 +30,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

View File

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0

View File

@@ -21,8 +21,8 @@ env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (publish-cuda-release)
setup-ec2:
name: Setup EC2 instance (publish-cuda-release)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -40,8 +40,8 @@ jobs:
publish-cuda-release:
name: Publish CUDA Release
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
needs: setup-ec2
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -54,7 +54,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: stable
@@ -104,10 +104,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (publish-release)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, publish-cuda-release ]
teardown-ec2:
name: Teardown EC2 instance (publish-release)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, publish-cuda-release ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -118,7 +118,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -126,4 +126,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -17,10 +17,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- name: Checkout lattice-estimator
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: malb/lattice-estimator
path: lattice_estimator

View File

@@ -45,7 +45,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -55,7 +55,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -89,13 +89,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_shortint
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -53,7 +53,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -71,12 +71,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -115,7 +115,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -52,7 +52,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -70,12 +70,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab
@@ -99,7 +99,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -36,6 +36,10 @@ on:
description: "Run core crypto benches"
type: boolean
default: true
core_crypto_gpu_bench:
description: "Run core crypto benches on GPU"
type: boolean
default: true
wasm_client_bench:
description: "Run WASM client benches"
type: boolean
@@ -49,17 +53,18 @@ jobs:
command: [ boolean_bench, shortint_bench,
integer_bench, integer_multi_bit_bench,
signed_integer_bench, signed_integer_multi_bit_bench,
core_crypto_bench, wasm_client_bench ]
integer_gpu_bench, integer_multi_bit_gpu_bench,
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
with:
files_yaml: |
common_benches:
@@ -106,7 +111,7 @@ jobs:
- .github/workflows/wasm_client_benchmark.yml
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -25,17 +25,17 @@ jobs:
strategy:
matrix:
command: [ boolean_bench, shortint_full_bench,
integer_full_bench, signed_integer_full_bench,
core_crypto_bench, wasm_client_bench ]
integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -13,9 +13,14 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
- name: Save repo
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: repo-archive
path: '.'
- name: git-sync
uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
with:

View File

@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
with:
toolchain: nightly
@@ -98,13 +98,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: zama-ai/slab
path: slab

View File

@@ -160,12 +160,6 @@ check_nvm_installed:
@source ~/.nvm/nvm.sh && nvm --version > /dev/null 2>&1 || \
( echo "Unable to locate Node. Run 'make install_node'" && exit 1 )
.PHONY: install_mlc # Install mlc (Markup Link Checker)
install_mlc: install_rs_build_toolchain
@mlc --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install mlc --locked || \
( echo "Unable to install mlc, unknown error." && exit 1 )
.PHONY: fmt # Format rust code
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
@@ -277,7 +271,7 @@ clippy_js_wasm_api: install_rs_check_toolchain
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
clippy_tasks: install_rs_check_toolchain
clippy_tasks:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-p tasks -- --no-deps -D warnings
@@ -287,19 +281,19 @@ clippy_trivium: install_rs_check_toolchain
-p tfhe-trivium -- --no-deps -D warnings
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
clippy_all_targets: install_rs_check_toolchain
clippy_all_targets:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
clippy_concrete_csprng: install_rs_check_toolchain
clippy_concrete_csprng:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE) \
-p concrete-csprng -- --no-deps -D warnings
.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
clippy_zk_pok: install_rs_check_toolchain
clippy_zk_pok:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok -- --no-deps -D warnings
@@ -382,7 +376,7 @@ build_c_api_gpu: install_rs_check_toolchain
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
@@ -450,14 +444,14 @@ test_cuda_backend:
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_rs_build_toolchain
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
@@ -485,14 +479,14 @@ test_c_api_rs: install_rs_check_toolchain
.PHONY: test_c_api_c # Run the C tests for the C API
test_c_api_c: build_c_api
./scripts/c_api_tests.sh --cargo-profile "$(CARGO_PROFILE)"
./scripts/c_api_tests.sh
.PHONY: test_c_api # Run all the tests for the C API
test_c_api: test_c_api_rs test_c_api_c
.PHONY: test_c_api_gpu # Run the C tests for the C API
test_c_api_gpu: build_c_api_gpu
./scripts/c_api_tests.sh --gpu --cargo-profile "$(CARGO_PROFILE)"
./scripts/c_api_tests.sh --gpu
.PHONY: test_shortint_ci # Run the tests for shortint ci
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
@@ -644,12 +638,12 @@ test_kreyvium: install_rs_build_toolchain
-p tfhe-trivium -- --test-threads=1 kreyvium::
.PHONY: test_concrete_csprng # Run concrete-csprng tests
test_concrete_csprng: install_rs_build_toolchain
test_concrete_csprng:
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng
.PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
test_zk_pok: install_rs_build_toolchain
test_zk_pok:
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok
@@ -688,23 +682,15 @@ format_doc_latex:
check_md_docs_are_tested:
RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested
.PHONY: check_intra_md_links # Checks broken internal links in Markdown docs
check_intra_md_links: install_mlc
mlc --offline --match-file-extension tfhe/docs
.PHONY: check_md_links # Checks all broken links in Markdown docs
check_md_links: install_mlc
mlc --match-file-extension tfhe/docs
.PHONY: check_compile_tests # Build tests in debug without running them
check_compile_tests: install_rs_build_toolchain
check_compile_tests:
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
-p $(TFHE_SPEC)
@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
"$(MAKE)" build_c_api && \
./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
./scripts/c_api_tests.sh --build-only; \
fi
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
@@ -827,6 +813,8 @@ bench_oprf: install_rs_check_toolchain
--bench oprf-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
bench_shortint_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -835,6 +823,7 @@ bench_shortint_multi_bit: install_rs_check_toolchain
--bench shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_boolean # Run benchmarks for boolean
bench_boolean: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
@@ -878,7 +867,6 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
#
# Utility tools
#
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
gen_key_cache: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
@@ -954,8 +942,8 @@ sha256_bool: install_rs_check_toolchain
--features=$(TARGET_ARCH_FEATURE),boolean
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
clippy_all check_compile_tests
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_all \
check_compile_tests
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu

View File

@@ -71,7 +71,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
}
fn main() {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
let (client_key, server_key) = generate_keys(config);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -143,7 +143,7 @@ use tfhe::prelude::*;
use tfhe_trivium::TriviumStreamShortint;
fn test_shortint() {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::all_disabled().enable_default_integers().build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = CastingKey::new((&client_key, &server_key), (&hl_client_key, &hl_server_key));

View File

@@ -13,7 +13,6 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
[build-dependencies]
cmake = { version = "0.1" }
pkg-config = { version = "0.3" }
[dependencies]
thiserror = "1.0"

View File

@@ -21,15 +21,7 @@ fn main() {
let dest = cmake::build("cuda");
println!("cargo:rustc-link-search=native={}", dest.display());
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
// Try to find the cuda libs with pkg-config, default to the path used by the nvidia runfile
if pkg_config::Config::new()
.atleast_version("10")
.probe("cuda")
.is_err()
{
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
}
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
println!("cargo:rustc-link-lib=gomp");
println!("cargo:rustc-link-lib=cudart");
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e

View File

@@ -4,14 +4,14 @@
#include <cstdint>
extern "C" {
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);
};

View File

@@ -8,6 +8,7 @@
#include <cuda_runtime.h>
#define synchronize_threads_in_block() __syncthreads()
extern "C" {
#define check_cuda_error(ans) \
@@ -26,33 +27,47 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort(); \
}
cudaStream_t cuda_create_stream(uint32_t gpu_index);
struct cuda_stream_t {
cudaStream_t stream;
uint32_t gpu_index;
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t(uint32_t gpu_index) {
this->gpu_index = gpu_index;
void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
check_cuda_error(cudaStreamCreate(&stream));
}
void release() {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamDestroy(stream));
}
void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
};
cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
void cuda_destroy_stream(cuda_stream_t *stream);
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
bool cuda_check_support_cooperative_groups();
bool cuda_check_support_thread_block_clusters();
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);
int cuda_get_number_of_gpus();
@@ -60,18 +75,20 @@ void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cuda_stream_t *stream);
int cuda_get_max_shared_memory(uint32_t gpu_index);
void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
void cuda_synchronize_stream(cuda_stream_t *stream);
void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data);
}
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer);
}
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
Torus n);
#endif
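
A minimal host-side C++ sketch (not part of the diff) of how the new cuda_stream_t API above fits together, assuming only the declarations in this header; the payload size and element type are illustrative:

#include <cstdint>
#include <vector>

// Sketch only: drives the stream-owning API declared above.
// The buffer length and element type are placeholders.
void gpu_roundtrip_sketch() {
  cuda_stream_t *stream = cuda_create_stream(/* gpu_index */ 0);

  std::vector<uint64_t> host(1024, 42);
  uint64_t bytes = host.size() * sizeof(uint64_t);

  void *device = cuda_malloc_async(bytes, stream);           // allocation is tied to the stream
  cuda_memcpy_async_to_gpu(device, host.data(), bytes, stream);
  // ... enqueue kernels that now take a cuda_stream_t* instead of (cudaStream_t, gpu_index) ...
  cuda_memcpy_async_to_cpu(host.data(), device, bytes, stream);

  stream->synchronize();                                      // wait for all queued work
  cuda_drop_async(device, stream);
  cuda_destroy_stream(stream);
}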

View File

@@ -1,10 +0,0 @@
#ifndef HELPER_H
#define HELPER_H
extern "C" {
int cuda_setup_multi_gpu();
}
void multi_gpu_checks(uint32_t gpu_count);
#endif

File diff suppressed because it is too large

View File

@@ -6,16 +6,16 @@
extern "C" {
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
}
#endif // CNCRT_KS_H_

View File

@@ -7,42 +7,42 @@
extern "C" {
void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
}
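
The same convention applies to every entry point in this header: the cuda_stream_t* moves to the first position and the separate gpu_index argument disappears, since the index now lives inside the stream. A small sketch with placeholder device pointers and sizes:

// Sketch only: d_out / d_lhs / d_rhs are placeholder device buffers.
void add_lwe_vectors_sketch(cuda_stream_t *stream, void *d_out, void *d_lhs,
                            void *d_rhs, uint32_t lwe_dimension,
                            uint32_t num_cts) {
  // before: cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, d_out, d_lhs, d_rhs, lwe_dimension, num_cts);
  cuda_add_lwe_ciphertext_vector_64(stream, d_out, d_lhs, d_rhs, lwe_dimension,
                                    num_cts);
  stream->synchronize();
}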

View File

@@ -5,89 +5,80 @@
#include <cstdint>
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1 };
extern "C" {
void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
void *input1, void *input2, void *output,
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
cuda_stream_t *stream,
uint32_t polynomial_size,
uint32_t total_polynomials);
void cuda_convert_lwe_programmable_bootstrap_key_32(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_amortized_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
uint32_t gpu_index,
void cleanup_cuda_programmable_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer);
void scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
void cleanup_cuda_programmable_bootstrap(cuda_stream_t *stream,
int8_t **pbs_buffer);
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
@@ -120,28 +111,6 @@ get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // tbc
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
@@ -156,11 +125,6 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
@@ -171,14 +135,13 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
PBS_VARIANT pbs_variant;
pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory) {
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
if (allocate_gpu_memory) {
switch (pbs_variant) {
@@ -210,17 +173,17 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
level_count * (glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double2),
stream, gpu_index);
stream);
global_accumulator = (Torus *)cuda_malloc_async(
(glwe_dimension + 1) * input_lwe_ciphertext_count *
polynomial_size * sizeof(Torus),
stream, gpu_index);
stream);
} break;
case PBS_VARIANT::CG: {
uint64_t full_sm =
@@ -243,73 +206,25 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2),
stream, gpu_index);
stream);
} break;
#if CUDA_ARCH >= 900
case PBS_VARIANT::TBC: {
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
Torus>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If over that we don't have more
// partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
// not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_accumulator_fft = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2),
stream, gpu_index);
} break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
}
}
void release(cudaStream_t stream, uint32_t gpu_index) {
cuda_drop_async(d_mem, stream, gpu_index);
cuda_drop_async(global_accumulator_fft, stream, gpu_index);
void release(cuda_stream_t *stream) {
cuda_drop_async(d_mem, stream);
cuda_drop_async(global_accumulator_fft, stream);
if (pbs_variant == DEFAULT)
cuda_drop_async(global_accumulator, stream, gpu_index);
cuda_drop_async(global_accumulator, stream);
}
};
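
The full/partial/no-shared-memory fallback implemented by these constructors can be summed up in a few lines. A standalone C++ sketch using the full_sm and partial_sm names from the diff; the enum and helper are illustrative, not part of the change:

#include <cstdint>

enum class SmMode { FULLSM, PARTIALSM, NOSM };

struct PbsMemoryPlan {
  SmMode mode;
  uint64_t device_mem_per_block; // extra global memory needed per block
};

// full_sm: shared memory needed to keep the whole working set on-chip.
// partial_sm: shared memory needed when only the accumulator fft stays on-chip.
PbsMemoryPlan plan_pbs_memory(uint64_t max_shared_memory, uint64_t full_sm,
                              uint64_t partial_sm) {
  uint64_t full_dm = full_sm;                 // everything spilled to global memory
  uint64_t partial_dm = full_sm - partial_sm; // only the non-fft part spilled

  if (max_shared_memory < partial_sm)
    return {SmMode::NOSM, full_dm};
  if (max_shared_memory < full_sm)
    return {SmMode::PARTIALSM, partial_dm};
  return {SmMode::FULLSM, 0};                 // both kernels run fully in shared memory
}

The constructors then multiply the per-block amount by input_lwe_ciphertext_count * level_count * (glwe_dimension + 1) before calling cuda_malloc_async; for scale, the accumulator fft term sizeof(double2) * polynomial_size / 2 comes to 16 KiB at polynomial_size = 2048.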
@@ -348,9 +263,9 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
@@ -358,54 +273,28 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
#if (CUDA_ARCH >= 900)
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#endif
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t max_shared_memory);
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,

View File

@@ -11,98 +11,59 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t num_samples, uint32_t max_shared_memory);
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t grouping_factor);
void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t chunk_size = 0);
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t chunk_size = 0);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
void scratch_cuda_generic_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);
void cuda_generic_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
int8_t **pbs_buffer);
}
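
Taken together, these declarations imply a scratch / bootstrap / cleanup lifecycle. A hedged C++ sketch with the new stream-first signatures; every numeric value below is a placeholder, not a recommended FHE parameter set:

// Sketch only: device buffers are passed in already populated; parameters are placeholders.
void multi_bit_pbs_sketch(cuda_stream_t *stream, void *d_lwe_out,
                          void *d_out_indexes, void *d_lut, void *d_lut_indexes,
                          void *d_lwe_in, void *d_in_indexes, void *d_bsk) {
  uint32_t lwe_dimension = 742, glwe_dimension = 1, polynomial_size = 2048;
  uint32_t level_count = 1, grouping_factor = 3, base_log = 23;
  uint32_t num_samples = 4, num_luts = 1, lwe_idx = 0;
  uint32_t max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_multi_bit_programmable_bootstrap_64(
      stream, &pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, grouping_factor, /*input_lwe_ciphertext_count=*/num_samples,
      max_shared_memory, /*allocate_gpu_memory=*/true);

  cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
      stream, d_lwe_out, d_out_indexes, d_lut, d_lut_indexes, d_lwe_in,
      d_in_indexes, d_bsk, pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, grouping_factor, base_log, level_count, num_samples,
      num_luts, lwe_idx, max_shared_memory);

  cleanup_cuda_multi_bit_programmable_bootstrap(stream, &pbs_buffer);
  stream->synchronize();
}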
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
#endif
template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -111,7 +72,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
@@ -119,9 +80,9 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -152,25 +113,12 @@ template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
double2 *keybundle_fft;
Torus *global_accumulator;
@@ -178,27 +126,25 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
PBS_VARIANT pbs_variant;
pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
// default
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<
Torus>(polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
polynomial_size);
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
Torus>(polynomial_size);
// cg
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
polynomial_size);
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
@@ -216,124 +162,80 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
auto num_blocks_acc_cg =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
#if CUDA_ARCH >= 900
uint64_t full_sm_tbc_accumulate =
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_sm_tbc_accumulate =
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
auto num_blocks_acc_tbc = num_blocks_acc_cg;
#endif
if (allocate_gpu_memory) {
// Keybundle
if (max_shared_memory < full_sm_keybundle)
d_mem_keybundle = (int8_t *)cuda_malloc_async(
num_blocks_keybundle * full_sm_keybundle, stream, gpu_index);
num_blocks_keybundle * full_sm_keybundle, stream);
switch (pbs_variant) {
case PBS_VARIANT::CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index);
break;
case PBS_VARIANT::DEFAULT:
case DEFAULT:
// Accumulator step one
if (max_shared_memory < partial_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
gpu_index);
num_blocks_acc_step_one * full_sm_accumulate_step_one, stream);
else if (max_shared_memory < full_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
gpu_index);
num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream);
// Accumulator step two
if (max_shared_memory < full_sm_accumulate_step_two)
d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
gpu_index);
num_blocks_acc_step_two * full_sm_accumulate_step_two, stream);
break;
#if CUDA_ARCH >= 900
case TBC:
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If, beyond that, fewer than
// partial_sm_tbc_accumulate extra bytes are available, the TBC PBS runs on
// NOSM. If we have partial_sm_tbc_accumulate but not full_sm_tbc_accumulate
// bytes, it runs on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
// Accumulator TBC
if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
num_blocks_acc_tbc * full_sm_tbc_accumulate, stream, gpu_index);
else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream,
gpu_index);
case CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
keybundle_fft = (double2 *)cuda_malloc_async(
num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
stream, gpu_index);
stream);
global_accumulator = (Torus *)cuda_malloc_async(
num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
gpu_index);
num_blocks_acc_step_two * polynomial_size * sizeof(Torus), stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
stream, gpu_index);
stream);
}
}
void release(cudaStream_t stream, uint32_t gpu_index) {
void release(cuda_stream_t *stream) {
if (d_mem_keybundle)
cuda_drop_async(d_mem_keybundle, stream, gpu_index);
cuda_drop_async(d_mem_keybundle, stream);
switch (pbs_variant) {
case DEFAULT:
if (d_mem_acc_step_one)
cuda_drop_async(d_mem_acc_step_one, stream, gpu_index);
cuda_drop_async(d_mem_acc_step_one, stream);
if (d_mem_acc_step_two)
cuda_drop_async(d_mem_acc_step_two, stream, gpu_index);
cuda_drop_async(d_mem_acc_step_two, stream);
break;
case CG:
if (d_mem_acc_cg)
cuda_drop_async(d_mem_acc_cg, stream, gpu_index);
cuda_drop_async(d_mem_acc_cg, stream);
break;
#if CUDA_ARCH >= 900
case TBC:
if (d_mem_acc_tbc)
cuda_drop_async(d_mem_acc_tbc, stream, gpu_index);
break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
cuda_drop_async(keybundle_fft, stream, gpu_index);
cuda_drop_async(global_accumulator, stream, gpu_index);
cuda_drop_async(global_accumulator_fft, stream, gpu_index);
cuda_drop_async(keybundle_fft, stream);
cuda_drop_async(global_accumulator, stream);
cuda_drop_async(global_accumulator_fft, stream);
}
};
template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t max_shared_memory);
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count);
#endif
#endif // CUDA_MULTI_BIT_H
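Note on the TBC branch above: the constructor picks between FULLSM, PARTIALSM and NOSM purely from shared-memory thresholds, with minimum_sm_tbc always reserved. A minimal standalone sketch of that decision, using a hypothetical enum and helper name that are not part of the backend:

#include <cstdint>

// Illustrative only: mirrors the thresholds used when allocating d_mem_acc_tbc.
enum class tbc_sm_mode { NOSM, PARTIALSM, FULLSM };

static tbc_sm_mode pick_tbc_mode(uint64_t max_shared_memory,
                                 uint64_t minimum_sm_tbc,
                                 uint64_t partial_sm_tbc_accumulate,
                                 uint64_t full_sm_tbc_accumulate) {
  // minimum_sm_tbc is always needed, so only the remaining budget matters.
  if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
    return tbc_sm_mode::NOSM;      // accumulator stays in global memory
  if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
    return tbc_sm_mode::PARTIALSM; // only part of the accumulator in shared
  return tbc_sm_mode::FULLSM;      // everything fits in shared memory
}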

View File

@@ -10,8 +10,7 @@ set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper.h)
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)

View File

@@ -1,21 +1 @@
#include "ciphertext.cuh"
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
uint32_t gpu_index,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
(uint64_t *)src, number_of_cts, lwe_dimension);
}
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
uint32_t gpu_index,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
(uint64_t *)src, number_of_cts, lwe_dimension);
}

View File

@@ -6,23 +6,39 @@
#include <cstdint>
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
uint32_t gpu_index, T *dest,
T *src, uint32_t number_of_cts,
void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
cuda_memcpy_async_to_gpu(dest, src, size, stream);
}
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
(uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
}
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
uint32_t gpu_index, T *dest,
T *src, uint32_t number_of_cts,
void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
cuda_memcpy_async_to_cpu(dest, src, size, stream);
}
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
(uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
}
#endif
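A hedged host-side sketch of the refactored conversion entry points, assembled from signatures that appear elsewhere in this diff (stream creation, async allocation and drop, the two conversion functions). The sizes, the wrapper function and the malloc bookkeeping are illustrative, and the backend headers declaring these functions are assumed to be included:

#include <cstdint>
#include <cstdlib>

// Illustrative round trip of LWE ciphertexts through the GPU with the new
// cuda_stream_t-based API; error handling and real data are omitted.
void example_lwe_roundtrip() {
  uint32_t lwe_dimension = 512;
  uint32_t number_of_cts = 4;
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(uint64_t);

  cuda_stream_t *stream = cuda_create_stream(0); // stream bound to GPU 0
  uint64_t *h_cts = (uint64_t *)malloc(size);    // host-side ciphertexts
  uint64_t *d_cts = (uint64_t *)cuda_malloc_async(size, stream);

  cuda_convert_lwe_ciphertext_vector_to_gpu_64(d_cts, h_cts, stream,
                                               number_of_cts, lwe_dimension);
  // ... run keyswitch / PBS kernels on the same stream ...
  cuda_convert_lwe_ciphertext_vector_to_cpu_64(h_cts, d_cts, stream,
                                               number_of_cts, lwe_dimension);

  cuda_synchronize_stream(stream);
  cuda_drop_async(d_cts, stream);
  cuda_destroy_stream(stream);
  free(h_cts);
}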

View File

@@ -49,15 +49,11 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
* global memory
*/
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, double2 *dest, T *src,
void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t max_shared_memory) {
if (gpu_count != 1)
PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
"supported yet.")
cudaSetDevice(gpu_indexes[0]);
uint32_t gpu_index, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;
@@ -66,11 +62,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
if (max_shared_memory < shared_memory_size) {
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
<<<gridSize, blockSize, 0, streams[0]>>>(dest, src, d_mem);
<<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
} else {
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
<<<gridSize, blockSize, shared_memory_size, streams[0]>>>(dest, src,
d_mem);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
d_mem);
}
check_cuda_error(cudaGetLastError());
}

View File

@@ -6,13 +6,12 @@
* Head out to the equivalent operation on 64 bits for more details.
*/
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
@@ -36,13 +35,12 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
* - num_samples blocks of threads are launched
*/
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),

View File

@@ -98,12 +98,12 @@ keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
/// assumes lwe_array_in is in the gpu
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
constexpr int ideal_threads = 128;
int lwe_size = lwe_dimension_out + 1;
@@ -124,14 +124,13 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
int shared_mem = sizeof(Torus) * lwe_size;
cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream,
gpu_index);
cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
check_cuda_error(cudaGetLastError());
dim3 grid(num_samples, 1, 1);
dim3 threads(ideal_threads, 1, 1);
keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
keyswitch<Torus><<<grid, threads, shared_mem, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
lwe_upper, cutoff);

View File

@@ -3,23 +3,14 @@
#include <cuda_runtime.h>
/// Unsafe function to create a CUDA stream, must check first that GPU exists
cudaStream_t cuda_create_stream(uint32_t gpu_index) {
cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cudaStream_t stream;
check_cuda_error(cudaStreamCreate(&stream));
cuda_stream_t *stream = new cuda_stream_t(gpu_index);
return stream;
}
/// Unsafe function to destroy CUDA stream, must check first the GPU exists
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamDestroy(stream));
}
void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamSynchronize(stream));
}
void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); }
/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
@@ -34,22 +25,20 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
check_cuda_error(cudaSetDevice(stream->gpu_index));
void *ptr;
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
cudaDevAttrMemoryPoolsSupported,
stream->gpu_index));
if (support_async_alloc) {
cuda_synchronize_stream(stream, gpu_index);
check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream));
cuda_synchronize_stream(stream, gpu_index);
check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream->stream));
} else {
check_cuda_error(cudaMalloc((void **)&ptr, size));
}
@@ -82,60 +71,46 @@ bool cuda_check_support_cooperative_groups() {
return cooperative_groups_supported > 0;
}
/// Returns
/// false if Thread Block Cluster is not supported.
/// true otherwise
bool cuda_check_support_thread_block_clusters() {
#if CUDA_ARCH >= 900
// To-do: Is this really the best way to check support?
int tbc_supported = 0;
check_cuda_error(
cudaDeviceGetAttribute(&tbc_supported, cudaDevAttrClusterLaunch, 0));
return tbc_supported > 0;
#else
return false;
#endif
}
/// Copy memory to the GPU asynchronously
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer in async copy to GPU.")
}
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
}
/// Copy memory within a GPU asynchronously
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
if (attr_dest.device != gpu_index && attr_dest.type != cudaMemoryTypeDevice) {
if (attr_dest.device != stream->gpu_index &&
attr_dest.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
}
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
if (attr_src.device != gpu_index && attr_src.type != cudaMemoryTypeDevice) {
if (attr_src.device != stream->gpu_index &&
attr_src.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
}
if (attr_src.device != attr_dest.device) {
PANIC("Cuda error: different devices specified in copy from GPU to GPU.")
}
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
stream->stream));
}
/// Synchronizes device
@@ -145,16 +120,16 @@ void cuda_synchronize_device(uint32_t gpu_index) {
}
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda memset.")
}
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
}
template <typename Torus>
@@ -165,45 +140,42 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
}
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n) {
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
Torus n) {
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
if (attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
}
check_cuda_error(cudaSetDevice(gpu_index));
int block_size = 256;
int num_blocks = (n + block_size - 1) / block_size;
// Launch the kernel
cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
n);
cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
n);
check_cuda_error(cudaGetLastError());
}
/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
uint64_t *d_array, uint64_t value,
uint64_t n);
template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
uint32_t *d_array, uint32_t value,
uint32_t n);
template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
uint64_t value, uint64_t n);
template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
uint32_t value, uint32_t n);
/// Copy memory to the CPU asynchronously
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, src));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
}
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
}
/// Return number of GPUs available
@@ -220,18 +192,19 @@ void cuda_drop(void *ptr, uint32_t gpu_index) {
}
/// Drop a cuda array asynchronously, if supported on the device
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
void cuda_drop_async(void *ptr, cuda_stream_t *stream) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaSetDevice(stream->gpu_index));
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
cudaDevAttrMemoryPoolsSupported,
stream->gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaFreeAsync(ptr, stream));
check_cuda_error(cudaFreeAsync(ptr, stream->stream));
} else {
check_cuda_error(cudaFree(ptr));
}
@@ -250,11 +223,13 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
return max_shared_memory;
}
void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }
void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
check_cuda_error(
cudaStreamAddCallback(stream->stream, callback, user_data, 0));
}
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
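For context, the cuda_stream_t type that all of these signatures now take is not defined anywhere in this excerpt. Judging only from how it is used here (new cuda_stream_t(gpu_index), the ->stream and ->gpu_index members, and the synchronize() / release() methods), it presumably looks roughly like the reconstruction below; the real definition lives in the backend's device header and may differ in detail:

#include <cuda_runtime.h>
#include <cstdint>

// Reconstruction for readability only, based on usage in this diff.
// check_cuda_error is the backend's existing error-checking helper.
struct cuda_stream_t {
  cudaStream_t stream;
  uint32_t gpu_index;

  cuda_stream_t(uint32_t gpu_index) : gpu_index(gpu_index) {
    check_cuda_error(cudaSetDevice(gpu_index));
    check_cuda_error(cudaStreamCreate(&stream));
  }

  void synchronize() {
    check_cuda_error(cudaSetDevice(gpu_index));
    check_cuda_error(cudaStreamSynchronize(stream));
  }

  void release() {
    check_cuda_error(cudaSetDevice(gpu_index));
    check_cuda_error(cudaStreamDestroy(stream));
  }
};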

View File

@@ -1,7 +1,7 @@
#include "integer/bitwise_ops.cuh"
void scratch_cuda_integer_radix_bitop_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -15,19 +15,17 @@ void scratch_cuda_integer_radix_bitop_kb_64(
message_modulus, carry_modulus);
scratch_cuda_integer_radix_bitop_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count, params,
op_type, allocate_gpu_memory);
stream, (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count,
params, op_type, allocate_gpu_memory);
}
void cuda_bitop_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count) {
host_integer_radix_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
@@ -35,22 +33,19 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
}
void cuda_bitnot_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t lwe_ciphertext_count) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
host_integer_radix_bitnot_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count);
}
void cleanup_cuda_integer_bitop(void *stream, uint32_t gpu_index,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {
int_bitop_buffer<uint64_t> *mem_ptr =
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

View File

@@ -13,8 +13,7 @@
template <typename Torus>
__host__ void
host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_1, Torus *lwe_array_2,
int_bitop_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
@@ -22,32 +21,32 @@ host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
auto lut = mem_ptr->lut;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
bsk, ksk, num_radix_blocks, lut);
}
template <typename Torus>
__host__ void host_integer_radix_bitnot_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, int_bitop_buffer<Torus> *mem_ptr,
void *bsk, Torus *ksk, uint32_t num_radix_blocks) {
auto lut = mem_ptr->lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsk, ksk,
stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
num_radix_blocks, lut);
}
template <typename Torus>
__host__ void
host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in,
int_bitop_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto lut = mem_ptr->lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks, lut);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
cudaStream_t stream, uint32_t gpu_index, int_bitop_buffer<Torus> **mem_ptr,
cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
*mem_ptr = new int_bitop_buffer<Torus>(stream, gpu_index, op, params,
num_radix_blocks, allocate_gpu_memory);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
allocate_gpu_memory);
}
#endif

View File

@@ -1,7 +1,7 @@
#include "integer/cmux.cuh"
void scratch_cuda_integer_radix_cmux_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -17,20 +17,17 @@ void scratch_cuda_integer_radix_cmux_kb_64(
[](uint64_t x) -> uint64_t { return x == 1; };
scratch_cuda_integer_radix_cmux_kb(
static_cast<cudaStream_t>(stream), gpu_index,
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
stream, (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
lwe_ciphertext_count, params, allocate_gpu_memory);
}
void cuda_cmux_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
void *lwe_array_false, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t lwe_ciphertext_count) {
host_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_condition),
static_cast<uint64_t *>(lwe_array_true),
static_cast<uint64_t *>(lwe_array_false),
@@ -39,10 +36,10 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
lwe_ciphertext_count);
}
void cleanup_cuda_integer_radix_cmux(void *stream, uint32_t gpu_index,
void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

View File

@@ -5,13 +5,12 @@
#include <omp.h>
template <typename Torus>
__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_input, Torus *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
int big_lwe_size = params.big_lwe_dimension + 1;
@@ -28,7 +27,8 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
stream->stream>>>(
lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
params.message_modulus, 1);
@@ -36,23 +36,23 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsk,
ksk, num_radix_blocks, predicate);
stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
predicate);
}
template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true,
Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
__host__ void
host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_condition, Torus *lwe_array_true,
Torus *lwe_array_false,
int_cmux_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
// Since our CPU threads will be working on different streams, we must make
// sure the work in the main stream is completed first
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
stream->synchronize();
auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;
@@ -62,43 +62,41 @@ __host__ void host_integer_radix_cmux_kb(
#pragma omp section
{
auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if(&true_stream, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsk, ksk, num_radix_blocks);
zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
lwe_condition, mem_true, mem_ptr->inverted_predicate_lut, bsk,
ksk, num_radix_blocks);
}
#pragma omp section
{
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if(&false_stream, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
lwe_array_false, lwe_condition, mem_false,
mem_ptr->predicate_lut, bsk, ksk, num_radix_blocks);
zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
lwe_condition, mem_false, mem_ptr->predicate_lut, bsk, ksk,
num_radix_blocks);
}
}
cuda_synchronize_stream(true_stream, gpu_indexes[0]);
cuda_synchronize_stream(false_stream, gpu_indexes[0]);
cuda_synchronize_stream(true_stream);
cuda_synchronize_stream(false_stream);
// If the condition was true, true_ct will have kept its value and false_ct
// will be 0. If the condition was false, true_ct will be 0 and false_ct will
// have kept its value
auto added_cts = mem_ptr->tmp_true_ct;
host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
mem_ptr->tmp_false_ct, params.big_lwe_dimension,
num_radix_blocks);
host_addition(stream, added_cts, mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
params.big_lwe_dimension, num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsk, ksk,
num_radix_blocks, mem_ptr->message_extract_lut);
stream, lwe_array_out, added_cts, bsk, ksk, num_radix_blocks,
mem_ptr->message_extract_lut);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_cmux_kb(
cudaStream_t stream, uint32_t gpu_index, int_cmux_buffer<Torus> **mem_ptr,
cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
*mem_ptr =
new int_cmux_buffer<Torus>(stream, gpu_index, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory);
}
#endif
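The cmux above boils down to: zero out, on its own stream, the branch that does not match the condition, add the two results, then clean up with the message-extraction LUT. A cleartext analogue of that selection for a single block, illustrative only and not backend code:

#include <cstdint>

// Cleartext analogue of host_integer_radix_cmux_kb for one radix block.
// condition is 0 or 1; the encrypted version applies the same idea per block.
static uint64_t cleartext_cmux(uint64_t condition, uint64_t value_true,
                               uint64_t value_false, uint64_t message_modulus) {
  uint64_t kept_true = condition ? value_true : 0;   // zero_out_if (true branch)
  uint64_t kept_false = condition ? 0 : value_false; // zero_out_if (false branch)
  uint64_t added = kept_true + kept_false;           // host_addition
  return added % message_modulus;                    // message_extract_lut
}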

View File

@@ -1,7 +1,7 @@
#include "integer/comparison.cuh"
void scratch_cuda_integer_radix_comparison_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -18,9 +18,8 @@ void scratch_cuda_integer_radix_comparison_kb_64(
case EQ:
case NE:
scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
op_type, false, allocate_gpu_memory);
stream, (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks,
params, op_type, false, allocate_gpu_memory);
break;
case GT:
case GE:
@@ -29,17 +28,16 @@ void scratch_cuda_integer_radix_comparison_kb_64(
case MAX:
case MIN:
scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
op_type, is_signed, allocate_gpu_memory);
stream, (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks,
params, op_type, is_signed, allocate_gpu_memory);
break;
}
}
void cuda_comparison_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
void *bsk, void *ksk, uint32_t num_radix_blocks) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_radix_blocks) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -47,8 +45,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
case EQ:
case NE:
host_integer_radix_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
static_cast<uint64_t *>(ksk), num_radix_blocks);
@@ -58,8 +55,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
case LT:
case LE:
host_integer_radix_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer,
buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
@@ -68,8 +64,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
case MAX:
case MIN:
host_integer_radix_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
static_cast<uint64_t *>(ksk), num_radix_blocks);
@@ -79,10 +74,10 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
}
}
void cleanup_cuda_integer_comparison(void *stream, uint32_t gpu_index,
void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

View File

@@ -33,17 +33,16 @@ __global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
}
template <typename Torus>
__host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
Torus *output, Torus *input,
uint32_t lwe_dimension,
__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
Torus *input, uint32_t lwe_dimension,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
// Add all blocks and store in sum
device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
output, input, lwe_dimension, num_radix_blocks);
check_cuda_error(cudaGetLastError());
}
@@ -57,13 +56,12 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
*/
template <typename Torus>
__host__ void
are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -78,10 +76,9 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = total_modulus - 1;
cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
tmp_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
uint32_t remaining_blocks = num_radix_blocks;
@@ -95,8 +92,8 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
auto input_blocks = tmp_out;
auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
for (int i = 0; i < num_chunks; i++) {
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension, chunk_length);
accumulate_all_blocks(stream, accumulator, input_blocks,
big_lwe_dimension, chunk_length);
accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
@@ -118,18 +115,16 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
lut = (*is_equal_to_num_blocks_map)[chunk_length];
} else {
// LUT needs to be computed
auto new_lut =
new int_radix_lut<Torus>(streams[0], gpu_indexes[0], params,
max_value, num_radix_blocks, true);
auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
num_radix_blocks, true);
auto is_equal_to_num_blocks_lut_f = [max_value,
chunk_length](Torus x) -> Torus {
return (x & max_value) == chunk_length;
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], new_lut->lut, glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
is_equal_to_num_blocks_lut_f);
stream, new_lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);
(*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
lut = new_lut;
@@ -140,13 +135,11 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk,
1, lut);
stream, lwe_array_out, accumulator, bsk, ksk, 1, lut);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsk, ksk,
num_chunks, lut);
stream, tmp_out, accumulator, bsk, ksk, num_chunks, lut);
}
}
}
@@ -159,12 +152,9 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
*/
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -175,10 +165,9 @@ __host__ void is_at_least_one_comparisons_block_true(
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = total_modulus - 1;
cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
mem_ptr->tmp_lwe_array_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
uint32_t remaining_blocks = num_radix_blocks;
while (remaining_blocks > 0) {
@@ -191,8 +180,8 @@ __host__ void is_at_least_one_comparisons_block_true(
auto input_blocks = mem_ptr->tmp_lwe_array_out;
auto accumulator = buffer->tmp_block_accumulated;
for (int i = 0; i < num_chunks; i++) {
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension, chunk_length);
accumulate_all_blocks(stream, accumulator, input_blocks,
big_lwe_dimension, chunk_length);
accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
@@ -207,13 +196,12 @@ __host__ void is_at_least_one_comparisons_block_true(
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk,
1, lut);
stream, lwe_array_out, accumulator, bsk, ksk, 1, lut);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
accumulator, bsk, ksk, num_chunks, lut);
stream, mem_ptr->tmp_lwe_array_out, accumulator, bsk, ksk, num_chunks,
lut);
}
}
}
@@ -239,12 +227,11 @@ __host__ void is_at_least_one_comparisons_block_true(
// are_all_comparisons_block_true
template <typename Torus>
__host__ void host_compare_with_zero_equality(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -269,8 +256,7 @@ __host__ void host_compare_with_zero_equality(
if (num_radix_blocks == 1) {
// Just copy
cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
num_sum_blocks = 1;
} else {
uint32_t remainder_blocks = num_radix_blocks;
@@ -280,8 +266,8 @@ __host__ void host_compare_with_zero_equality(
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);
accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);
accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
chunk_size);
num_sum_blocks++;
remainder_blocks -= (chunk_size - 1);
@@ -293,46 +279,41 @@ __host__ void host_compare_with_zero_equality(
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, sum, sum, bsk, ksk, num_sum_blocks,
zero_comparison);
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
sum, mem_ptr, bsk, ksk, num_sum_blocks);
stream, sum, sum, bsk, ksk, num_sum_blocks, zero_comparison);
are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
num_sum_blocks);
}
template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t num_radix_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto eq_buffer = mem_ptr->eq_buffer;
// Applies the LUT for the comparison operation
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
bsk, ksk, num_radix_blocks, eq_buffer->operator_lut);
stream, comparisons, lwe_array_1, lwe_array_2, bsk, ksk, num_radix_blocks,
eq_buffer->operator_lut);
// This takes a Vec of blocks, where each block is either 0 or 1.
//
// It returns a block encrypting 1 if all input blocks are 1,
// otherwise the block encrypts 0
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr, bsk, ksk,
num_radix_blocks);
are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
bsk, ksk, num_radix_blocks);
}
template <typename Torus>
__host__ void
compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -353,21 +334,21 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
lwe_array_right, big_lwe_dimension, num_radix_blocks);
host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
big_lwe_dimension, num_radix_blocks);
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsk, ksk,
num_radix_blocks, is_non_zero_lut);
stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
is_non_zero_lut);
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
big_lwe_dimension, num_radix_blocks,
message_modulus, carry_modulus);
}
// Reduces a vec containing shortint blocks that encrypts a sign
@@ -375,14 +356,13 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// final sign
template <typename Torus>
__host__ void
tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = tree_buffer->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -401,19 +381,16 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
auto y = tree_buffer->tmp_y;
if (x != lwe_block_comparisons)
cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
big_lwe_size_bytes * num_radix_blocks,
streams[0], gpu_indexes[0]);
big_lwe_size_bytes * num_radix_blocks, stream);
uint32_t partial_block_count = num_radix_blocks;
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
while (partial_block_count > 2) {
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsk, ksk,
partial_block_count >> 1, inner_tree_leaf);
stream, x, y, bsk, ksk, partial_block_count >> 1, inner_tree_leaf);
if ((partial_block_count % 2) != 0) {
partial_block_count >>= 1;
@@ -423,8 +400,7 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
auto last_x_block = x + (partial_block_count - 1) * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(last_x_block, last_y_block,
big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
big_lwe_size_bytes, stream);
} else {
partial_block_count >>= 1;
}
@@ -435,8 +411,7 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
std::function<Torus(Torus)> f;
if (partial_block_count == 2) {
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
int msb = (x >> 2) & 3;
@@ -450,24 +425,23 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
y = x;
f = sign_handler_f;
}
generate_device_accumulator<Torus>(streams[0], gpu_indexes[0], last_lut->lut,
glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);
generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, f);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsk, ksk, 1, last_lut);
integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
ksk, 1, last_lut);
}
template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto diff_buffer = mem_ptr->diff_buffer;
auto params = mem_ptr->params;
@@ -489,21 +463,21 @@ __host__ void host_integer_radix_difference_check_kb(
if (mem_ptr->is_signed) {
packed_num_radix_blocks -= 2;
}
pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
packed_num_radix_blocks, message_modulus);
pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
packed_num_radix_blocks, message_modulus);
// From this point we have half number of blocks
packed_num_radix_blocks /= 2;
// Clean noise
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, packed_left, packed_left, bsk, ksk,
packed_num_radix_blocks, identity_lut);
stream, packed_left, packed_left, bsk, ksk, packed_num_radix_blocks,
identity_lut);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, packed_right, packed_right, bsk, ksk,
packed_num_radix_blocks, identity_lut);
stream, packed_right, packed_right, bsk, ksk, packed_num_radix_blocks,
identity_lut);
lhs = packed_left;
rhs = packed_right;
@@ -518,15 +492,15 @@ __host__ void host_integer_radix_difference_check_kb(
if (!mem_ptr->is_signed) {
// Compare packed blocks, or simply the total number of radix blocks in the
// inputs
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks);
compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
packed_num_radix_blocks);
num_comparisons = packed_num_radix_blocks;
} else {
// Packing is possible
if (carry_modulus >= message_modulus) {
// Compare (num_radix_blocks - 2) / 2 packed blocks
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks);
compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
packed_num_radix_blocks);
// Compare the last block before the sign block separately
auto identity_lut = mem_ptr->identity_lut;
@@ -536,35 +510,32 @@ __host__ void host_integer_radix_difference_check_kb(
diff_buffer->tmp_packed_right +
packed_num_radix_blocks * big_lwe_size;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
stream, last_left_block_before_sign_block,
lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
identity_lut);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
stream, last_right_block_before_sign_block,
lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
identity_lut);
compare_radix_blocks_kb(
streams, gpu_indexes, gpu_count,
comparisons + packed_num_radix_blocks * big_lwe_size,
stream, comparisons + packed_num_radix_blocks * big_lwe_size,
last_left_block_before_sign_block, last_right_block_before_sign_block,
mem_ptr, bsk, ksk, 1);
// Compare the sign block separately
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count,
comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
stream, comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
mem_ptr->signed_lut);
num_comparisons = packed_num_radix_blocks + 2;
} else {
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
lwe_array_left, lwe_array_right, mem_ptr, bsk,
ksk, num_radix_blocks - 1);
compare_radix_blocks_kb(stream, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsk, ksk,
num_radix_blocks - 1);
// Compare the sign block separately
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count,
comparisons + (num_radix_blocks - 1) * big_lwe_size,
stream, comparisons + (num_radix_blocks - 1) * big_lwe_size,
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
mem_ptr->signed_lut);
@@ -575,44 +546,39 @@ __host__ void host_integer_radix_difference_check_kb(
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsk, ksk, num_comparisons);
tree_sign_reduction(stream, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
ksk, num_comparisons);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_comparison_check_kb(
cudaStream_t stream, uint32_t gpu_index,
int_comparison_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, COMPARISON_TYPE op, bool is_signed,
bool allocate_gpu_memory) {
cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool is_signed, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
*mem_ptr = new int_comparison_buffer<Torus>(stream, gpu_index, op, params,
num_radix_blocks, is_signed,
allocate_gpu_memory);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_comparison_buffer<Torus>(
stream, op, params, num_radix_blocks, is_signed, allocate_gpu_memory);
}
template <typename Torus>
__host__ void
host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
// Compute the sign
host_integer_radix_difference_check_kb(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsk,
ksk, total_num_radix_blocks);
stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks);
// Selector
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
total_num_radix_blocks);
host_integer_radix_cmux_kb(
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}
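// For intuition only: min/max above is "compare, then select", i.e. the
// encrypted counterpart of the plain-integer sketch below. compute_max is an
// illustrative stand-in for the choice that the buffer's LUTs encode.
static inline uint64_t plain_maxmin(uint64_t lhs, uint64_t rhs,
                                    bool compute_max) {
  bool lhs_ge_rhs = (lhs >= rhs); // host_integer_radix_difference_check_kb
  bool pick_lhs = compute_max ? lhs_ge_rhs : !lhs_ge_rhs;
  return pick_lhs ? lhs : rhs;    // host_integer_radix_cmux_kb
}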
#endif

View File

@@ -1,141 +0,0 @@
#include "integer/div_rem.cuh"
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
#ifdef BENCH_SCRATCH_LEVEL_1
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, static_cast<cudaStream_t>(stream));
#endif
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_div_rem_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
#ifdef BENCH_SCRATCH_LEVEL_1
cudaEventRecord(stop, static_cast<cudaStream_t>(stream));
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Time for scratch operations: %.3f ms\n", milliseconds);
#endif
}
void cuda_integer_div_rem_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
void *remainder, void *numerator, void *divisor, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t num_blocks) {
auto stream_array = (cudaStream_t *)(streams);
auto cur_stream = stream_array[0];
#ifdef BENCH_HOST_LEVEL_1
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, cur_stream);
#endif
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
switch (mem->params.polynomial_size) {
case 512:
host_integer_div_rem_kb<uint64_t, Degree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
break;
case 1024:
host_integer_div_rem_kb<uint64_t, Degree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
break;
case 2048:
host_integer_div_rem_kb<uint64_t, Degree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
break;
case 4096:
host_integer_div_rem_kb<uint64_t, Degree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
break;
case 8192:
host_integer_div_rem_kb<uint64_t, Degree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
break;
case 16384:
host_integer_div_rem_kb<uint64_t, Degree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
break;
default:
PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
}
#ifdef BENCH_HOST_LEVEL_1
cudaEventRecord(stop, cur_stream);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Time for host operations: %.3f ms\n", milliseconds);
#endif
}
void cleanup_cuda_integer_div_rem(void *stream, uint32_t gpu_index,
int8_t **mem_ptr_void) {
#ifdef BENCH_DROP_LEVEL_1
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, static_cast<cudaStream_t>(stream));
#endif
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
#ifdef BENCH_DROP_LEVEL_1
cudaEventRecord(stop, static_cast<cudaStream_t>(stream));
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Time for drop operations: %.3f ms\n", milliseconds);
#endif
}

View File

@@ -1,965 +0,0 @@
#ifndef TFHE_RS_DIV_REM_CUH
#define TFHE_RS_DIV_REM_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>
#define BENCH_SCRATCH_LEVEL_1
#define BENCH_HOST_LEVEL_1
#define BENCH_DROP_LEVEL_1
#define BENCH_HOST_PHASE_0_LEVEL_2
#define BENCH_HOST_PHASE_1_LEVEL_3
#define BENCH_HOST_PHASE_2_LEVEL_3
#define BENCH_HOST_PHASE_3_LEVEL_3
#define BENCH_HOST_PHASE_4_LEVEL_2
#define BENCH_LEVEL_4
#define BENCH_OVERFLOW_SUM_LEVEL_2
float total_phase_1 = 0;
float total_phase_2 = 0;
float total_phase_3 = 0;
float total_phase_overflow = 0;
float total_phase_1_1 = 0;
float total_phase_1_2 = 0;
float total_phase_1_3 = 0;
float total_phase_1_4 = 0;
float total_phase_2_1 = 0;
float total_phase_2_2 = 0;
float total_phase_2_3 = 0;
float total_phase_2_4 = 0;
float total_phase_3_1 = 0;
float total_phase_3_2 = 0;
float total_phase_3_3 = 0;
float total_phase_3_4 = 0;
int ceil_div(int a, int b) { return (a + b - 1) / b; }
// struct makes it easier to use a list of ciphertexts and move data between them
// struct does not allocate or drop any memory,
// it only keeps track of the number of ciphertexts inside the list.
template <typename Torus> struct lwe_ciphertext_list {
Torus *data;
size_t max_blocks;
size_t len;
int_radix_params params;
size_t big_lwe_size;
size_t radix_size;
size_t big_lwe_size_bytes;
size_t radix_size_bytes;
size_t big_lwe_dimension;
lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
: data(src), params(params), max_blocks(max_blocks) {
big_lwe_size = params.big_lwe_dimension + 1;
big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
radix_size = max_blocks * big_lwe_size;
radix_size_bytes = radix_size * sizeof(Torus);
big_lwe_dimension = params.big_lwe_dimension;
len = max_blocks;
}
// copies ciphertexts from a Torus*, starting from `start_block` up to and
// including `finish_block`; does not change the value of `len`
void copy_from(Torus *src, size_t start_block, size_t finish_block,
cudaStream_t stream, uint32_t gpu_index) {
size_t tmp_len = finish_block - start_block + 1;
cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
tmp_len * big_lwe_size_bytes, stream,
gpu_index);
}
// copies ciphertexts from an lwe_ciphertext_list, starting from `start_block`
// up to and including `finish_block`; does not change the value of `len`
void copy_from(const lwe_ciphertext_list &src, size_t start_block,
size_t finish_block, cudaStream_t stream, uint32_t gpu_index) {
copy_from(src.data, start_block, finish_block, stream, gpu_index);
}
// copies ciphertexts from a Torus*, starting from `start_block`
// up to and including `finish_block`, updating the value of `len`
void clone_from(Torus *src, size_t start_block, size_t finish_block,
cudaStream_t stream, uint32_t gpu_index) {
len = finish_block - start_block + 1;
cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
len * big_lwe_size_bytes, stream, gpu_index);
}
// copies ciphertexts from an lwe_ciphertext_list, starting from `start_block`
// up to and including `finish_block`, updating the value of `len`
void clone_from(const lwe_ciphertext_list &src, size_t start_block,
size_t finish_block, cudaStream_t stream,
uint32_t gpu_index) {
clone_from(src.data, start_block, finish_block, stream, gpu_index);
}
// assign zero to blocks starting from `start_block` including `finish_block`
void assign_zero(size_t start_block, size_t finish_block, cudaStream_t stream,
uint32_t gpu_index) {
auto size = finish_block - start_block + 1;
cuda_memset_async(&data[start_block * big_lwe_size], 0,
size * big_lwe_size_bytes, stream, gpu_index);
}
// return pointer to last block
Torus *last_block() { return &data[(len - 1) * big_lwe_size]; }
// return pointer to first_block
Torus *first_block() { return data; }
// return block with `index`
Torus *get_block(size_t index) {
assert(index < len);
return &data[index * big_lwe_size];
}
bool is_empty() { return len == 0; }
// does not drop actual memory from `data`, only reduces the value of `len` by one
void pop() {
if (len > 0)
len--;
else
assert(len > 0);
}
// insert ciphertext at index `ind`
void insert(size_t ind, Torus *ciphertext_block, cudaStream_t stream,
uint32_t gpu_index) {
assert(ind <= len);
assert(len < max_blocks);
size_t insert_offset = ind * big_lwe_size;
for (size_t i = len; i > ind; i--) {
Torus *src = &data[(i - 1) * big_lwe_size];
Torus *dst = &data[i * big_lwe_size];
cuda_memcpy_async_gpu_to_gpu(dst, src, big_lwe_size_bytes, stream,
gpu_index);
}
cuda_memcpy_async_gpu_to_gpu(&data[insert_offset], ciphertext_block,
big_lwe_size_bytes, stream, gpu_index);
len++;
}
// push ciphertext at the end of `data`
void push(Torus *ciphertext_block, cudaStream_t stream, uint32_t gpu_index) {
assert(len < max_blocks);
size_t offset = len * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(&data[offset], ciphertext_block,
big_lwe_size_bytes, stream, gpu_index);
len++;
}
// duplicate ciphertext into `number_of_blocks` ciphertexts
void fill_with_same_ciphertext(Torus *ciphertext, size_t number_of_blocks,
cudaStream_t stream, uint32_t gpu_index) {
assert(number_of_blocks <= max_blocks);
for (size_t i = 0; i < number_of_blocks; i++) {
Torus *dest = &data[i * big_lwe_size];
cuda_memcpy_async_gpu_to_gpu(dest, ciphertext, big_lwe_size_bytes, stream,
gpu_index);
}
len = number_of_blocks;
}
// used for debugging, prints body of each ciphertext.
void print_blocks_body(const char *name) {
for (int i = 0; i < len; i++) {
print_debug(name, &data[i * big_lwe_size + big_lwe_dimension], 1);
}
}
};
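// Illustrative usage of the bookkeeping above (d_buf, src, block, params,
// stream and gpu_index are placeholders for data allocated elsewhere):
//   lwe_ciphertext_list<uint64_t> list(d_buf, params, num_blocks);
//   list.clone_from(src, 0, 3, stream, gpu_index); // len becomes 4
//   list.pop();                                    // len becomes 3, data kept
//   list.push(block, stream, gpu_index);           // len back to 4
// copy_from moves the same data but leaves `len` untouched.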
template <typename Torus>
__host__ void
scratch_cuda_integer_div_rem_kb(cudaStream_t stream, uint32_t gpu_index,
int_div_rem_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_div_rem_memory<Torus>(stream, gpu_index, params,
num_blocks, allocate_gpu_memory);
}
template <typename Torus, class params>
__host__ void
host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *quotient, Torus *remainder,
Torus *numerator, Torus *divisor, void *bsk,
uint64_t *ksk, int_div_rem_memory<uint64_t> *mem_ptr,
uint32_t num_blocks) {
total_phase_1 = 0;
total_phase_2 = 0;
total_phase_3 = 0;
total_phase_overflow = 0;
total_phase_1_1 = 0;
total_phase_1_2 = 0;
total_phase_1_3 = 0;
total_phase_1_4 = 0;
total_phase_2_1 = 0;
total_phase_2_2 = 0;
total_phase_2_3 = 0;
total_phase_3_1 = 0;
total_phase_3_2 = 0;
total_phase_3_3 = 0;
total_phase_3_4 = 0;
cudaEvent_t start, stop;
float milliseconds = 0;
#ifdef BENCH_HOST_PHASE_0_LEVEL_2
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
auto radix_params = mem_ptr->params;
auto big_lwe_dimension = radix_params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
uint32_t message_modulus = radix_params.message_modulus;
uint32_t carry_modulus = radix_params.carry_modulus;
// number of message bits per block, i.e. log2(message_modulus) for a
// power-of-two modulus (e.g. message_modulus = 4 -> 2 bits)
uint32_t num_bits_in_message = 31 - __builtin_clz(message_modulus);
uint32_t total_bits = num_bits_in_message * num_blocks;
// put temporary buffers in lwe_ciphertext_list for easy use
lwe_ciphertext_list<Torus> remainder1(mem_ptr->remainder1, radix_params,
num_blocks);
lwe_ciphertext_list<Torus> remainder2(mem_ptr->remainder2, radix_params,
num_blocks);
lwe_ciphertext_list<Torus> numerator_block_stack(
mem_ptr->numerator_block_stack, radix_params, num_blocks);
lwe_ciphertext_list<Torus> numerator_block_1(mem_ptr->numerator_block_1,
radix_params, 1);
lwe_ciphertext_list<Torus> tmp_radix(mem_ptr->tmp_radix, radix_params,
num_blocks + 1);
lwe_ciphertext_list<Torus> interesting_remainder1(
mem_ptr->interesting_remainder1, radix_params, num_blocks + 1);
lwe_ciphertext_list<Torus> interesting_remainder2(
mem_ptr->interesting_remainder2, radix_params, num_blocks);
lwe_ciphertext_list<Torus> interesting_divisor(mem_ptr->interesting_divisor,
radix_params, num_blocks);
lwe_ciphertext_list<Torus> divisor_ms_blocks(mem_ptr->divisor_ms_blocks,
radix_params, num_blocks);
lwe_ciphertext_list<Torus> new_remainder(mem_ptr->new_remainder, radix_params,
num_blocks);
lwe_ciphertext_list<Torus> subtraction_overflowed(
mem_ptr->subtraction_overflowed, radix_params, 1);
lwe_ciphertext_list<Torus> did_not_overflow(mem_ptr->did_not_overflow,
radix_params, 1);
lwe_ciphertext_list<Torus> overflow_sum(mem_ptr->overflow_sum, radix_params,
1);
lwe_ciphertext_list<Torus> overflow_sum_radix(mem_ptr->overflow_sum_radix,
radix_params, num_blocks);
lwe_ciphertext_list<Torus> tmp_1(mem_ptr->tmp_1, radix_params, num_blocks);
lwe_ciphertext_list<Torus> at_least_one_upper_block_is_non_zero(
mem_ptr->at_least_one_upper_block_is_non_zero, radix_params, 1);
lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);
numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, streams[0],
gpu_indexes[0]);
remainder1.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
remainder2.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
cuda_memset_async(quotient, 0, big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
#ifdef BENCH_HOST_PHASE_0_LEVEL_2
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("--Time for phase_0 operations: %.3f ms\n", milliseconds);
#endif
for (int i = total_bits - 1; i >= 0; i--) {
#ifdef BENCH_HOST_PHASE_1_LEVEL_3
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
uint32_t block_of_bit = i / num_bits_in_message;
uint32_t pos_in_block = i % num_bits_in_message;
uint32_t msb_bit_set = total_bits - 1 - i;
uint32_t last_non_trivial_block = msb_bit_set / num_bits_in_message;
// Index to the first block of the remainder that is fully trivial 0
// and all blocks after it are also trivial zeros
// This number is in the range 1..=num_blocks - 1
uint32_t first_trivial_block = last_non_trivial_block + 1;
interesting_remainder1.clone_from(remainder1, 0, last_non_trivial_block,
streams[0], gpu_indexes[0]);
interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
streams[0], gpu_indexes[0]);
interesting_divisor.clone_from(divisor, 0, last_non_trivial_block,
streams[0], gpu_indexes[0]);
divisor_ms_blocks.clone_from(divisor,
(msb_bit_set + 1) / num_bits_in_message,
num_blocks - 1, streams[0], gpu_indexes[0]);
// We split the divisor at a block position, when in reality the split
// should be at a bit position, meaning that (depending on msb_bit_set) the
// split versions may share some bits they should not. So we do one PBS on
// the last block of interesting_divisor, and one on the first block of
// divisor_ms_blocks, to trim out the bits that should not be there.
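// (e.g. with 2-bit blocks and msb_bit_set = 2: block 1 ends up in both halves,
// so the last block of interesting_divisor keeps only its bit 0 (mask 0b01)
// and the first block of divisor_ms_blocks keeps only its bit 1 (mask 0b10))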
auto trim_last_interesting_divisor_bits =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count)
{
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
if ((msb_bit_set + 1) % num_bits_in_message == 0) {
return;
}
// The last block of the interesting part of the divisor
// can contain bits which we should not account for;
// we have to zero them out.
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
// e.g. 2 bits in message:
// if pos_in_block is 0, then we want to keep only the first bit (right
// shift mask by 1); if pos_in_block is 1, then we want to keep the two
// bits (right shift mask by 0)
uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
// Create mask of 1s on the message part, 0s in the carries
uint32_t full_message_mask = message_modulus - 1;
// Shift the mask so that we will only keep bits we should
uint32_t shifted_mask = full_message_mask >> shift_amount;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsk, ksk, 1,
mem_ptr->masking_luts_1[shifted_mask]);
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_1_1 += milliseconds;
//printf("Time for scratch operations: %.3f ms\n", milliseconds);
#endif
}; // trim_last_interesting_divisor_bits
auto trim_first_divisor_ms_bits =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
if (divisor_ms_blocks.is_empty() ||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
return;
}
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
// e.g. 2 bits in message:
// if pos_in_block is 0, then we want to discard the first bit (left
// shift mask by 1); if pos_in_block is 1, then we want to discard the
// two bits (left shift mask by 2), i.e. shift_amount = pos_in_block + 1
uint32_t shift_amount = pos_in_block + 1;
uint32_t full_message_mask = message_modulus - 1;
uint32_t shifted_mask = full_message_mask << shift_amount;
// Keep the mask within the range of message bits, so that
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsk, ksk, 1,
mem_ptr->masking_luts_2[shifted_mask]);
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_1_2 += milliseconds;
#endif
}; // trim_first_divisor_ms_bits
// This does
// R := R << 1; R(0) := N(i)
//
// We could do that by left shifting R by one, then unchecked_add the
// correct numerator bit.
//
// However, to keep the remainder clean (noise wise), what we do is put the
// numerator block from which we need to extract the bit as the LSB of the
// remainder, so that left shifting will pull in the bit we need.
auto left_shift_interesting_remainder1 =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
numerator_block_1.clone_from(
numerator_block_stack, numerator_block_stack.len - 1,
numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
numerator_block_stack.pop();
interesting_remainder1.insert(0, numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
mem_ptr->shift_mem_1, bsk, ksk, interesting_remainder1.len);
tmp_radix.clone_from(interesting_remainder1, 0,
interesting_remainder1.len - 1, streams[0],
gpu_indexes[0]);
radix_blocks_rotate_left<<<interesting_remainder1.len, 256, 0,
streams[0]>>>(
interesting_remainder1.data, tmp_radix.data, 1,
interesting_remainder1.len, big_lwe_size);
numerator_block_1.clone_from(
interesting_remainder1, interesting_remainder1.len - 1,
interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);
interesting_remainder1.pop();
if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next
// iteration
numerator_block_stack.push(numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);
}
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_1_3 += milliseconds;
#endif
}; // left_shift_interesting_remainder1
auto left_shift_interesting_remainder2 =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
mem_ptr->shift_mem_2, bsk, ksk, interesting_remainder2.len);
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_1_4 += milliseconds;
#endif
}; // left_shift_interesting_remainder2
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
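// Fork/join pattern: the main stream is synchronized first, each independent
// piece of work below runs on its own sub-stream from one OpenMP section, and
// the sub-streams are synchronized again before their results are combined.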
#pragma omp parallel sections
{
#pragma omp section
{
// interesting_divisor
trim_last_interesting_divisor_bits(&mem_ptr->sub_stream_1,
&gpu_indexes[0], 1);
}
#pragma omp section
{
// divisor_ms_blocks
trim_first_divisor_ms_bits(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1);
}
#pragma omp section
{
// interesting_remainder1
// numerator_block_stack
left_shift_interesting_remainder1(&mem_ptr->sub_stream_3,
&gpu_indexes[0], 1);
}
#pragma omp section
{
// interesting_remainder2
left_shift_interesting_remainder2(&mem_ptr->sub_stream_4,
&gpu_indexes[0], 1);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_stream_4, gpu_indexes[0]);
// if interesting_remainder1 != 0 -> interesting_remainder2 == 0
// if interesting_remainder1 == 0 -> interesting_remainder2 != 0
// In practice interesting_remainder1 contains the numerator bit,
// but in that position, interesting_remainder2 always has a 0
auto &merged_interesting_remainder = interesting_remainder1;
host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
merged_interesting_remainder.data,
interesting_remainder2.data, radix_params.big_lwe_dimension,
merged_interesting_remainder.len);
// after create_clean_version_of_merged_remainder
// `merged_interesting_remainder` will be reused as
// `cleaned_merged_interesting_remainder`
cleaned_merged_interesting_remainder.clone_from(
merged_interesting_remainder, 0, merged_interesting_remainder.len - 1,
streams[0], gpu_indexes[0]);
assert(merged_interesting_remainder.len == interesting_divisor.len);
// `new_remainder` is not initialized yet, so need to set length
new_remainder.len = merged_interesting_remainder.len;
#ifdef BENCH_HOST_PHASE_1_LEVEL_3
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_1 += milliseconds;
//printf("----Time for phase_1 operations: %.3f ms\n", milliseconds);
#endif
#ifdef BENCH_HOST_PHASE_2_LEVEL_3
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
// fills:
// `new_remainder` - radix ciphertext
// `subtraction_overflowed` - single ciphertext
auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
host_integer_overflowing_sub_kb<Torus, params>(
streams, gpu_indexes, gpu_count, new_remainder.data,
subtraction_overflowed.data, merged_interesting_remainder.data,
interesting_divisor.data, bsk, ksk, mem_ptr->overflow_sub_mem,
merged_interesting_remainder.len);
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_2_1 += milliseconds;
#endif
};
// fills:
// `at_least_one_upper_block_is_non_zero` - single ciphertext
auto check_divisor_upper_blocks = [&](cudaStream_t *streams,
uint32_t *gpu_indexes,
uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
auto &trivial_blocks = divisor_ms_blocks;
if (trivial_blocks.is_empty()) {
cuda_memset_async(at_least_one_upper_block_is_non_zero.first_block(), 0,
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
} else {
// We could call unchecked_scalar_ne
// But we are in the special case where scalar == 0
// So we can skip some stuff
host_compare_with_zero_equality(
streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
mem_ptr->comparison_buffer, bsk, ksk, trivial_blocks.len,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
tmp_1.len =
ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
is_at_least_one_comparisons_block_true(
streams, gpu_indexes, gpu_count,
at_least_one_upper_block_is_non_zero.data, tmp_1.data,
mem_ptr->comparison_buffer, bsk, ksk, tmp_1.len);
}
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_2_2 += milliseconds;
#endif
};
// Creates a cleaned version (noise wise) of the merged remainder
// so that it can be safely used in bivariate PBSes
// fills:
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data, bsk, ksk,
cleaned_merged_interesting_remainder.len,
mem_ptr->message_extract_lut_1);
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_2_3 += milliseconds;
#endif
};
// phase 2
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
#pragma omp parallel sections
{
#pragma omp section
{
// new_remainder
// subtraction_overflowed
do_overflowing_sub(&mem_ptr->sub_stream_1, &gpu_indexes[0], 1);
}
#pragma omp section
{
// at_least_one_upper_block_is_non_zero
check_divisor_upper_blocks(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1);
}
#pragma omp section
{
// cleaned_merged_interesting_remainder
create_clean_version_of_merged_remainder(&mem_ptr->sub_stream_3,
&gpu_indexes[0], 1);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
#ifdef BENCH_HOST_PHASE_2_LEVEL_3
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_2 += milliseconds;
// printf("----Time for phase_2 operations: %.3f ms\n", milliseconds);
#endif
#ifdef BENCH_OVERFLOW_SUM_LEVEL_2
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data,
radix_params.big_lwe_dimension, 1);
int factor = (i) ? 3 : 2;
int factor_lut_id = factor - 2;
overflow_sum_radix.fill_with_same_ciphertext(
overflow_sum.first_block(), cleaned_merged_interesting_remainder.len,
streams[0], gpu_indexes[0]);
#ifdef BENCH_OVERFLOW_SUM_LEVEL_2
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_overflow += milliseconds;
// printf("----Time for phase_2 operations: %.3f ms\n", milliseconds);
#endif
#ifdef BENCH_HOST_PHASE_3_LEVEL_3
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
auto conditionally_zero_out_merged_interesting_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data,
overflow_sum_radix.data, bsk, ksk,
cleaned_merged_interesting_remainder.len,
mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
factor);
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_3_1 += milliseconds;
#endif
};
auto conditionally_zero_out_merged_new_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
streams, gpu_indexes, gpu_count, new_remainder.data,
new_remainder.data, overflow_sum_radix.data, bsk, ksk,
new_remainder.len,
mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_3_2 += milliseconds;
#endif
};
auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
#ifdef BENCH_LEVEL_4
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, did_not_overflow.data,
subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data, bsk, ksk, 1,
mem_ptr->merge_overflow_flags_luts[pos_in_block]);
host_addition(streams[0], gpu_indexes[0],
&quotient[block_of_bit * big_lwe_size],
&quotient[block_of_bit * big_lwe_size],
did_not_overflow.data, radix_params.big_lwe_dimension, 1);
#ifdef BENCH_LEVEL_4
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_3_3 += milliseconds;
#endif
};
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
#pragma omp parallel sections
{
#pragma omp section
{
// cleaned_merged_interesting_remainder
conditionally_zero_out_merged_interesting_remainder(
&mem_ptr->sub_stream_1, &gpu_indexes[0], 1);
}
#pragma omp section
{
// new_remainder
conditionally_zero_out_merged_new_remainder(&mem_ptr->sub_stream_2,
&gpu_indexes[0], 1);
}
#pragma omp section
{
// quotient
set_quotient_bit(&mem_ptr->sub_stream_3, &gpu_indexes[0], 1);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
assert(first_trivial_block - 1 == cleaned_merged_interesting_remainder.len);
assert(first_trivial_block - 1 == new_remainder.len);
remainder1.copy_from(cleaned_merged_interesting_remainder, 0,
first_trivial_block - 1, streams[0], gpu_indexes[0]);
remainder2.copy_from(new_remainder, 0, first_trivial_block - 1, streams[0],
gpu_indexes[0]);
#ifdef BENCH_HOST_PHASE_3_LEVEL_3
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_phase_3 += milliseconds;
//printf("----Time for phase_3 operations: %.3f ms\n", milliseconds);
#endif
}
printf("----phase_1 total = %.3f ms; avg: = %.3f ms\n", total_phase_1,
total_phase_1 / total_bits);
{
printf("------phase_1_1 total = %.3f ms; avg: = %.3f ms\n", total_phase_1_1,
total_phase_1_1 / total_bits);
printf("------phase_1_2 total = %.3f ms; avg: = %.3f ms\n", total_phase_1_2,
total_phase_1_2 / total_bits);
printf("------phase_1_3 total = %.3f ms; avg: = %.3f ms\n", total_phase_1_3,
total_phase_1_3 / total_bits);
printf("------phase_1_4 total = %.3f ms; avg: = %.3f ms\n", total_phase_1_4,
total_phase_1_4 / total_bits);
}
printf("----phase_2 total = %.3f ms; avg: = %.3f ms\n", total_phase_2,
total_phase_2 / total_bits);
{
printf("------phase_2_1 total = %.3f ms; avg: = %.3f ms\n", total_phase_2_1,
total_phase_2_1 / total_bits);
printf("------phase_2_2 total = %.3f ms; avg: = %.3f ms\n", total_phase_2_2,
total_phase_2_2 / total_bits);
printf("------phase_2_3 total = %.3f ms; avg: = %.3f ms\n", total_phase_2_3,
total_phase_2_3 / total_bits);
}
// printf("----phase_overflow_sum total = %.3f ms; avg: = %.3f ms\n",
// total_phase_overflow,
// total_phase_overflow / total_bits);
printf("----phase_3 total = %.3f ms; avg: = %.3f ms\n", total_phase_3,
total_phase_3 / total_bits);
{
printf("------phase_3_1 total = %.3f ms; avg: = %.3f ms\n", total_phase_3_1,
total_phase_3_1 / total_bits);
printf("------phase_3_2 total = %.3f ms; avg: = %.3f ms\n", total_phase_3_2,
total_phase_3_2 / total_bits);
printf("------phase_3_3 total = %.3f ms; avg: = %.3f ms\n", total_phase_3_3,
total_phase_3_3 / total_bits);
}
#ifdef BENCH_HOST_PHASE_4_LEVEL_2
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start time
cudaEventRecord(start, streams[0]);
#endif
assert(remainder1.len == remainder2.len);
// Clean the quotient and remainder
// as even though they have no carries, they are not at nominal noise level
host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
remainder2.data, radix_params.big_lwe_dimension,
remainder1.len);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
#pragma omp parallel sections
{
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
&mem_ptr->sub_stream_1, &gpu_indexes[0], 1, remainder, remainder, bsk,
ksk, num_blocks, mem_ptr->message_extract_lut_1);
}
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
&mem_ptr->sub_stream_2, &gpu_indexes[0], 1, quotient, quotient, bsk,
ksk, num_blocks, mem_ptr->message_extract_lut_2);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
#ifdef BENCH_HOST_PHASE_4_LEVEL_2
cudaEventRecord(stop, streams[0]);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("--Time for phase_4 operations: %.3f ms\n", milliseconds);
#endif
}
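// For orientation only: a minimal, unencrypted sketch of the bit-serial
// restoring division that the loop above mirrors block-wise on ciphertexts.
// plain_restoring_div_rem is illustrative and not part of the library API.
static inline void plain_restoring_div_rem(uint64_t numerator, uint64_t divisor,
                                           uint32_t total_bits,
                                           uint64_t &quotient,
                                           uint64_t &remainder) {
  quotient = 0;
  remainder = 0;
  for (int i = (int)total_bits - 1; i >= 0; i--) {
    // phase 1: R := R << 1; R(0) := N(i)
    remainder = (remainder << 1) | ((numerator >> i) & 1);
    // phase 2: trial subtraction; the encrypted version computes an overflow
    // flag instead of branching
    if (remainder >= divisor) {
      // phase 3: keep the new remainder and set the quotient bit
      remainder -= divisor;
      quotient |= (uint64_t)1 << i;
    }
  }
}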
#endif // TFHE_RS_DIV_REM_CUH

View File

@@ -2,65 +2,58 @@
#include <linear_algebra.h>
void cuda_full_propagation_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *input_blocks, int8_t *mem_ptr, void *ksk, void *bsk,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_base_log, uint32_t ks_level, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_blocks) {
cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
uint32_t num_blocks) {
switch (polynomial_size) {
case 256:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(input_blocks),
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 512:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(input_blocks),
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 1024:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(input_blocks),
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 2048:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(input_blocks),
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 4096:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(input_blocks),
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 8192:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(input_blocks),
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 16384:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(input_blocks),
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
@@ -73,42 +66,41 @@ void cuda_full_propagation_64_inplace(
}
void scratch_cuda_full_propagation_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t lwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
scratch_cuda_full_propagation<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension, glwe_dimension,
polynomial_size, level_count, grouping_factor, input_lwe_ciphertext_count,
message_modulus, carry_modulus, pbs_type, allocate_gpu_memory);
stream, (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, message_modulus, carry_modulus, pbs_type,
allocate_gpu_memory);
}
void cleanup_cuda_full_propagation(void *stream, uint32_t gpu_index,
void cleanup_cuda_full_propagation(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_fullprop_buffer<uint64_t> *mem_ptr =
(int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
auto s = static_cast<cudaStream_t>(stream);
cuda_drop_async(mem_ptr->lut_buffer, s, gpu_index);
cuda_drop_async(mem_ptr->lut_indexes, s, gpu_index);
cuda_drop_async(mem_ptr->lut_buffer, stream);
cuda_drop_async(mem_ptr->lut_indexes, stream);
cuda_drop_async(mem_ptr->lwe_indexes, s, gpu_index);
cuda_drop_async(mem_ptr->lwe_indexes, stream);
cuda_drop_async(mem_ptr->tmp_small_lwe_vector, s, gpu_index);
cuda_drop_async(mem_ptr->tmp_big_lwe_vector, s, gpu_index);
cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
switch (mem_ptr->pbs_type) {
case CLASSICAL: {
auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(mem_ptr->pbs_buffer);
x->release(s, gpu_index);
x->release(stream);
} break;
case MULTI_BIT: {
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(mem_ptr->pbs_buffer);
x->release(s, gpu_index);
x->release(stream);
} break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
@@ -116,7 +108,7 @@ void cleanup_cuda_full_propagation(void *stream, uint32_t gpu_index,
}
void scratch_cuda_propagate_single_carry_kb_64_inplace(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -129,63 +121,23 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
message_modulus, carry_modulus);
scratch_cuda_propagate_single_carry_kb_inplace(
static_cast<cudaStream_t>(stream), gpu_index,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
}
void cuda_propagate_single_carry_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
void cuda_propagate_single_carry_kb_64_inplace(cuda_stream_t *stream,
void *lwe_array, int8_t *mem_ptr,
void *bsk, void *ksk,
uint32_t num_blocks) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array),
stream, static_cast<uint64_t *>(lwe_array),
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}
void cleanup_cuda_propagate_single_carry(void *stream, uint32_t gpu_index,
void cleanup_cuda_propagate_single_carry(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
}
void scratch_cuda_apply_univariate_lut_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, void *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_apply_univariate_lut_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
num_radix_blocks, params, allocate_gpu_memory);
}
void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void *input_radix_lwe, int8_t *mem_ptr,
void *ksk, void *bsk,
uint32_t num_blocks) {
host_apply_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk), bsk,
num_blocks);
}
void cleanup_cuda_apply_univariate_lut_kb_64(void *stream, uint32_t gpu_index,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

View File

@@ -82,20 +82,18 @@ device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out,
* becomes out = m1 * shift + m2
*/
template <typename Torus>
__host__ void pack_bivariate_blocks(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out,
__host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_indexes_out, Torus *lwe_array_1,
Torus *lwe_array_2, Torus *lwe_indexes_in,
uint32_t lwe_dimension, uint32_t shift,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
// Left message is shifted
int num_blocks = 0, num_threads = 0;
int num_entries = num_radix_blocks * (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
lwe_dimension, shift, num_radix_blocks);
check_cuda_error(cudaGetLastError());
@@ -103,9 +101,9 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,
template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, void *bsk, Torus *ksk,
uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
cudaSetDevice(stream->gpu_index);
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -121,26 +119,24 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
// Compute Keyswitch-PBS
cuda_keyswitch_lwe_ciphertext_vector(
streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks,
lut->lwe_trivial_indexes, lwe_array_in, lut->lwe_indexes_in, ksk,
big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_in,
lut->lwe_indexes_in, ksk, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut, lut->lut_indexes,
lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk,
lut->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks,
lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}
template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void *bsk,
Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
cudaSetDevice(gpu_indexes[0]);
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
int_radix_lut<Torus> *lut) {
cudaSetDevice(stream->gpu_index);
// apply_lookup_table_bivariate
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -157,71 +153,23 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
// Left message is shifted
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
lut->lwe_indexes_in, big_lwe_dimension, message_modulus,
num_radix_blocks);
pack_bivariate_blocks(stream, lwe_array_pbs_in, lut->lwe_trivial_indexes,
lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
big_lwe_dimension, message_modulus, num_radix_blocks);
check_cuda_error(cudaGetLastError());
// Apply LUT
cuda_keyswitch_lwe_ciphertext_vector(
streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks,
lut->lwe_trivial_indexes, lwe_array_pbs_in, lut->lwe_trivial_indexes, ksk,
big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_pbs_in,
lut->lwe_trivial_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut, lut->lut_indexes,
lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk,
lut->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
}
template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb_factor(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void *bsk,
Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut,
uint32_t shift) {
cudaSetDevice(gpu_indexes[0]);
// apply_lookup_table_bivariate
auto params = lut->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
auto message_modulus = params.message_modulus;
// Left message is shifted
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
lut->lwe_indexes_in, big_lwe_dimension, shift,
num_radix_blocks);
check_cuda_error(cudaGetLastError());
// Apply LUT
cuda_keyswitch_lwe_ciphertext_vector(
streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks,
lut->lwe_trivial_indexes, lwe_array_pbs_in, lut->lwe_trivial_indexes, ksk,
big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut, lut->lut_indexes,
lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk,
lut->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks,
lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}
// Rotates the slice in-place such that the first mid elements of the slice move
@@ -287,38 +235,19 @@ void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
message_modulus, carry_modulus, wrapped_f);
}
template <typename Torus>
void generate_lookup_table_bivariate_with_factor(
Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, int factor) {
Torus factor_u64 = factor;
auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
Torus lhs = (input / factor_u64) % message_modulus;
Torus rhs = (input % factor_u64) % message_modulus;
return f(lhs, rhs);
};
generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, wrapped_f);
}
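// Worked example of the packing this wrapping undoes (illustrative values):
// with message_modulus = 4 and factor = 4, a bivariate input is packed as
// input = lhs * 4 + rhs, so lhs = 2, rhs = 3 gives input = 11 and the LUT
// evaluates f(11 / 4 % 4, 11 % 4 % 4) = f(2, 3).
static_assert((2u * 4u + 3u) / 4u % 4u == 2u && (2u * 4u + 3u) % 4u % 4u == 3u,
              "bivariate packing example");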
/*
* generate bivariate accumulator for device pointer
* stream - cuda stream
* v_stream - cuda stream
* acc - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
*/
template <typename Torus>
void generate_device_accumulator_bivariate(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f) {
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f) {
cudaSetDevice(gpu_index);
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -328,66 +257,29 @@ void generate_device_accumulator_bivariate(
message_modulus, carry_modulus, f);
// copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(Torus),
stream, gpu_index);
cuda_memcpy_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
// Release memory when possible
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
h_lut);
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
/*
* generate bivariate accumulator with factor scaling for device pointer
* v_stream - cuda stream
* acc - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
*/
template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor) {
cudaSetDevice(gpu_index);
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
// fill bivariate accumulator
generate_lookup_table_bivariate_with_factor<Torus>(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
// copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(Torus),
stream, gpu_index);
// Release memory when possible
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
h_lut);
}
/*
* generate accumulator for device pointer
* generate bivariate accumulator for device pointer
* v_stream - cuda stream
* acc - device pointer for accumulator
* ...
* f - evaluating function with one Torus input
*/
template <typename Torus>
void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
Torus *acc, uint32_t glwe_dimension,
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus)> f) {
cudaSetDevice(gpu_index);
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -399,27 +291,24 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
// copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index);
stream);
// Release memory when possible
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
h_lut);
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
template <typename Torus>
void scratch_cuda_propagate_single_carry_kb_inplace(
cudaStream_t stream, uint32_t gpu_index,
int_sc_prop_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cuda_stream_t *stream, int_sc_prop_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
*mem_ptr = new int_sc_prop_memory<Torus>(
stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory);
*mem_ptr = new int_sc_prop_memory<Torus>(stream, params, num_radix_blocks,
allocate_gpu_memory);
}
template <typename Torus>
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array,
void host_propagate_single_carry(cuda_stream_t *stream, Torus *lwe_array,
int_sc_prop_memory<Torus> *mem, void *bsk,
Torus *ksk, uint32_t num_blocks) {
auto params = mem->params;
@@ -436,17 +325,15 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
auto message_acc = mem->message_acc;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsk,
ksk, num_blocks, luts_array);
stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
luts_array);
// compute prefix sum with hillis&steele
int num_steps = ceil(log2((double)num_blocks));
int space = 1;
cudaSetDevice(gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
big_lwe_size_bytes * num_blocks, stream);
for (int step = 0; step < num_steps; step++) {
auto cur_blocks = &step_output[space * big_lwe_size];
@@ -454,38 +341,32 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
int cur_total_blocks = num_blocks - space;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
bsk, ksk, cur_total_blocks, luts_carry_propagation_sum);
stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
luts_carry_propagation_sum);
cudaSetDevice(gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
&generates_or_propagates[space * big_lwe_size], cur_blocks,
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
cur_blocks,
big_lwe_size_bytes * cur_total_blocks, stream);
space *= 2;
}
cudaSetDevice(gpu_indexes[0]);
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
host_addition(stream, lwe_array, lwe_array, step_output,
glwe_dimension * polynomial_size, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsk, ksk,
num_blocks, message_acc);
stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
}
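// Illustration (plain C++ on the host): the Hillis & Steele inclusive-scan
// pattern used above. At each round every element at index i >= space combines
// with the element at i - space and space doubles, so ceil(log2(n)) rounds
// propagate information across all n blocks. Plain addition stands in here for
// the encrypted combine step done by luts_carry_propagation_sum.
inline void hillis_steele_scan_example(int *v, int n) {
  for (int space = 1; space < n; space *= 2)
    for (int i = n - 1; i >= space; --i) // walking backwards keeps it in-place
      v[i] = v[i] + v[i - space];
}
// e.g. {1, 1, 1, 1} becomes {1, 2, 3, 4} after ceil(log2(4)) = 2 rounds.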
template <typename Torus>
void host_propagate_single_sub_borrow(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *overflowed, Torus *lwe_array,
void host_propagate_single_sub_borrow(cuda_stream_t *stream, Torus *overflowed,
Torus *lwe_array,
int_single_borrow_prop_memory<Torus> *mem,
void *bsk, Torus *ksk,
uint32_t num_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -500,15 +381,14 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
auto message_acc = mem->message_acc;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsk,
ksk, num_blocks, luts_array);
stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
luts_array);
// compute prefix sum with hillis&steele
int num_steps = ceil(log2((double)num_blocks));
int space = 1;
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
big_lwe_size_bytes * num_blocks, stream);
for (int step = 0; step < num_steps; step++) {
auto cur_blocks = &step_output[space * big_lwe_size];
@@ -516,30 +396,28 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
int cur_total_blocks = num_blocks - space;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
bsk, ksk, cur_total_blocks, luts_carry_propagation_sum);
stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
luts_carry_propagation_sum);
cuda_memcpy_async_gpu_to_gpu(
&generates_or_propagates[space * big_lwe_size], cur_blocks,
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
cur_blocks,
big_lwe_size_bytes * cur_total_blocks, stream);
space *= 2;
}
cuda_memcpy_async_gpu_to_gpu(
overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
big_lwe_size_bytes, stream);
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
step_output, glwe_dimension * polynomial_size, num_blocks);
host_subtraction(stream, lwe_array, lwe_array, step_output,
glwe_dimension * polynomial_size, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsk, ksk,
num_blocks, message_acc);
stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
}
/*
@@ -551,8 +429,7 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
* size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
*/
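// e.g. with glwe_dimension = 1, polynomial_size = 2048 and Torus = uint64_t the
// size above is 2 * (1 * 2048 + 1) * 8 = 32784 bytes for this scratch buffer.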
template <typename Torus, typename STorus, class params>
void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *input_blocks,
void host_full_propagate_inplace(cuda_stream_t *stream, Torus *input_blocks,
int_fullprop_buffer<Torus> *mem_ptr,
Torus *ksk, void *bsk, uint32_t lwe_dimension,
uint32_t glwe_dimension,
@@ -561,7 +438,6 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t pbs_level, uint32_t grouping_factor,
uint32_t num_blocks) {
cudaSetDevice(gpu_indexes[0]);
int big_lwe_size = (glwe_dimension * polynomial_size + 1);
int small_lwe_size = (lwe_dimension + 1);
@@ -569,32 +445,29 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
auto cur_input_block = &input_blocks[i * big_lwe_size];
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector,
mem_ptr->lwe_indexes, cur_input_block, mem_ptr->lwe_indexes, ksk,
stream, mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes,
cur_input_block, mem_ptr->lwe_indexes, ksk,
polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
1);
cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
mem_ptr->tmp_small_lwe_vector,
small_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
small_lwe_size * sizeof(Torus), stream);
execute_pbs<Torus>(
streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector,
mem_ptr->lwe_indexes, mem_ptr->lut_buffer, mem_ptr->lut_indexes,
stream, mem_ptr->tmp_big_lwe_vector, mem_ptr->lwe_indexes,
mem_ptr->lut_buffer, mem_ptr->lut_indexes,
mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk,
mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, grouping_factor, 2, 2, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), mem_ptr->pbs_type);
cuda_get_max_shared_memory(stream->gpu_index), mem_ptr->pbs_type);
cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
big_lwe_size * sizeof(Torus), stream);
if (i < num_blocks - 1) {
auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
host_addition(streams[0], gpu_indexes[0], next_input_block,
next_input_block,
host_addition(stream, next_input_block, next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
glwe_dimension * polynomial_size, 1);
}
@@ -603,18 +476,18 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
void scratch_cuda_full_propagation(
cudaStream_t stream, uint32_t gpu_index,
int_fullprop_buffer<Torus> **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t pbs_level,
uint32_t grouping_factor, uint32_t num_radix_blocks,
cuda_stream_t *stream, int_fullprop_buffer<Torus> **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int8_t *pbs_buffer;
execute_scratch_pbs<Torus>(
stream, gpu_index, &pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, pbs_level, grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(gpu_index), pbs_type, allocate_gpu_memory);
execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, pbs_level, grouping_factor,
num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index),
pbs_type, allocate_gpu_memory);
// LUT
Torus *lut_buffer;
@@ -624,7 +497,7 @@ void scratch_cuda_full_propagation(
Torus lut_buffer_size =
2 * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);
lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream, gpu_index);
lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream);
// LUTs
auto lut_f_message = [message_modulus](Torus x) -> Torus {
@@ -640,36 +513,34 @@ void scratch_cuda_full_propagation(
lut_buffer + (glwe_dimension + 1) * polynomial_size;
generate_device_accumulator<Torus>(
stream, gpu_index, lut_buffer_message, glwe_dimension, polynomial_size,
stream, lut_buffer_message, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(
stream, gpu_index, lut_buffer_carry, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry);
generate_device_accumulator<Torus>(stream, lut_buffer_carry, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_carry);
}
Torus *lut_indexes;
if (allocate_gpu_memory) {
lut_indexes =
(Torus *)cuda_malloc_async(2 * sizeof(Torus), stream, gpu_index);
lut_indexes = (Torus *)cuda_malloc_async(2 * sizeof(Torus), stream);
Torus h_lut_indexes[2] = {0, 1};
cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, 2 * sizeof(Torus),
stream, gpu_index);
stream);
}
Torus *lwe_indexes;
if (allocate_gpu_memory) {
Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
lwe_indexes =
(Torus *)cuda_malloc_async(lwe_indexes_size, stream, gpu_index);
lwe_indexes = (Torus *)cuda_malloc_async(lwe_indexes_size, stream);
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
stream, gpu_index);
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
stream);
cuda_stream_add_callback(stream, host_free_on_stream_callback,
h_lwe_indexes);
}
@@ -681,10 +552,8 @@ void scratch_cuda_full_propagation(
Torus big_vector_size =
2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus);
small_lwe_vector =
(Torus *)cuda_malloc_async(small_vector_size, stream, gpu_index);
big_lwe_vector =
(Torus *)cuda_malloc_async(big_vector_size, stream, gpu_index);
small_lwe_vector = (Torus *)cuda_malloc_async(small_vector_size, stream);
big_lwe_vector = (Torus *)cuda_malloc_async(big_vector_size, stream);
}
*mem_ptr = new int_fullprop_buffer<Torus>;
@@ -738,16 +607,19 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
//
// Expects the carry buffer to be empty
template <typename Torus>
__host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *lwe_array_in,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t factor) {
cudaSetDevice(gpu_index);
__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t factor) {
if (lwe_array_out == lwe_array_in)
PANIC("Cuda error in pack blocks: input and output pointers must be "
"different.");
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
device_pack_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
}
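// Worked example (hedged): with message_modulus = 4 and factor = 4, packing a
// pair of blocks holding 3 and 2 yields one block holding 3 * 4 + 2 = 14,
// assuming the first block of the pair is the one scaled by factor; bivariate
// LUTs built with the same factor then decode both digits from the packed block.
// The empty-carry precondition keeps each digit below message_modulus, so the
// packed value stays inside the message/carry space.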
@@ -767,16 +639,14 @@ device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
template <typename Torus>
__host__ void
create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *scalar_array,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks, uint64_t message_modulus,
uint64_t carry_modulus) {
create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *scalar_array, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
uint64_t message_modulus, uint64_t carry_modulus) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream,
gpu_index);
cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);
if (num_scalar_blocks == 0)
return;
@@ -793,7 +663,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_create_trivial_radix<<<grid, thds, 0, stream>>>(
device_create_trivial_radix<<<grid, thds, 0, stream->stream>>>(
lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
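// Worked example of the encoding above: with message_modulus = carry_modulus = 4,
// delta = 2^63 / 16 = 2^59, so a clear digit m is written as m * 2^59 in the
// most significant bits, keeping the top bit free as padding above the
// message/carry space.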
@@ -804,26 +674,23 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
* * (lwe_dimension+1) * sizeof(Torus) bytes
*/
template <typename Torus>
__host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
__host__ void extract_n_bits(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in, void *bsk, Torus *ksk,
uint32_t num_radix_blocks, uint32_t bits_per_block,
int_bit_extract_luts_buffer<Torus> *bit_extract) {
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsk, ksk,
stream, lwe_array_out, lwe_array_in, bsk, ksk,
num_radix_blocks * bits_per_block, bit_extract->lut);
}
template <typename Torus>
__host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *signs_array_out,
__host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
Torus *signs_array_in,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f,
void *bsk, Torus *ksk, uint32_t num_sign_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto diff_buffer = mem_ptr->diff_buffer;
auto params = mem_ptr->params;
@@ -844,22 +711,20 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
auto signs_a = diff_buffer->tmp_signs_a;
auto signs_b = diff_buffer->tmp_signs_b;
cuda_memcpy_async_gpu_to_gpu(signs_a, signs_array_in,
(big_lwe_dimension + 1) * num_sign_blocks *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
signs_a, signs_array_in,
(big_lwe_dimension + 1) * num_sign_blocks * sizeof(Torus), stream);
if (num_sign_blocks > 2) {
auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, reduce_two_orderings_function);
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, reduce_two_orderings_function);
while (num_sign_blocks > 2) {
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
big_lwe_dimension, num_sign_blocks, 4);
pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, num_sign_blocks,
4);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsk, ksk,
num_sign_blocks / 2, lut);
stream, signs_a, signs_b, bsk, ksk, num_sign_blocks / 2, lut);
auto last_block_signs_b =
signs_b + (num_sign_blocks / 2) * (big_lwe_dimension + 1);
@@ -868,7 +733,7 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
if (num_sign_blocks % 2 == 1)
cuda_memcpy_async_gpu_to_gpu(last_block_signs_a, last_block_signs_b,
(big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
stream);
num_sign_blocks = (num_sign_blocks / 2) + (num_sign_blocks % 2);
}
@@ -882,14 +747,12 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
};
auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f);
generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, final_lut_f);
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
2, 4);
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, signs_array_out,
pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, 2, 4);
integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
signs_b, bsk, ksk, 1, lut);
} else {
@@ -900,41 +763,12 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
};
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f);
generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, final_lut_f);
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, signs_array_out,
integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
signs_a, bsk, ksk, 1, lut);
}
}
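// Illustration (host-side): the block-count schedule of the reduction above.
// Each round packs pairs of sign blocks (factor 4) and shrinks the count to
// ceil(n / 2) until at most two blocks feed the final LUT, e.g. 7 -> 4 -> 2.
inline int reduce_signs_rounds_example(int num_sign_blocks) {
  int rounds = 0;
  while (num_sign_blocks > 2) {
    num_sign_blocks = (num_sign_blocks / 2) + (num_sign_blocks % 2);
    ++rounds;
  }
  return rounds; // 7 sign blocks need 2 intermediate rounds before the final LUT
}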
template <typename Torus>
void scratch_cuda_apply_univariate_lut_kb(
cudaStream_t stream, uint32_t gpu_index, int_radix_lut<Torus> **mem_ptr,
Torus *input_lut, uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_radix_lut<Torus>(stream, gpu_index, params, 1,
num_radix_blocks, allocate_gpu_memory);
cuda_memcpy_async_to_gpu((*mem_ptr)->lut, input_lut,
(params.glwe_dimension + 1) *
params.polynomial_size * sizeof(Torus),
stream, gpu_index);
}
template <typename Torus>
void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *radix_lwe_out,
Torus *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *ksk,
void *bsk, uint32_t num_blocks) {
cudaSetDevice(gpu_indexes[0]);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsk, ksk,
num_blocks, mem);
}
#endif // TFHE_RS_INTERNAL_INTEGER_CUH

View File

@@ -66,12 +66,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
* the integer radix multiplication in keyswitch->bootstrap order.
*/
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
uint32_t num_radix_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size * glwe_dimension, lwe_dimension,
@@ -79,21 +79,14 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
grouping_factor, message_modulus, carry_modulus);
switch (polynomial_size) {
case 256:
case 512:
case 1024:
case 2048:
case 4096:
case 8192:
case 16384:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
allocate_gpu_memory);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
"Only N = 2048 is supported")
}
}
@@ -126,69 +119,18 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
* - 'max_shared_memory' maximum shared memory per cuda block
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right, void *bsk,
void *ksk, int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_left,
void *radix_lwe_right, void *bsk, void *ksk, int8_t *mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
@@ -196,21 +138,20 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
"Only N = 2048 is supported")
}
}
void cleanup_cuda_integer_mult(void *stream, uint32_t gpu_index,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_mult(cuda_stream_t *stream, int8_t **mem_ptr_void) {
int_mul_memory<uint64_t> *mem_ptr =
(int_mul_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}
void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks_in_radix,
@@ -222,15 +163,14 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
max_num_radix_in_vec, params, allocate_gpu_memory);
stream, (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr,
num_blocks_in_radix, max_num_radix_in_vec, params, allocate_gpu_memory);
}
void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks_in_radix) {
cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_vec,
uint32_t num_radix_in_vec, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks_in_radix) {
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
@@ -244,65 +184,58 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
switch (mem->params.polynomial_size) {
case 512:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 1024:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 2048:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 4096:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 8192:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
case 16384:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
num_radix_in_vec);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
PANIC("Cuda error (integer sum ciphertexts): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
}
free(terms_degree);
}
void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_sum_ciphertexts_vec(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

View File

@@ -91,15 +91,12 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
}
}
template <typename Torus, sharedMemDegree SMD>
template <typename Torus>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {
extern __shared__ int8_t sharedmem[];
Torus *result = (Torus *)sharedmem;
extern __shared__ Torus result[];
size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
@@ -109,9 +106,6 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
size_t block_stride = blockIdx.y * block_size;
auto dst_block = &dst_radix[block_stride];
if constexpr (SMD == NOSM)
result = dst_block;
// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = tid; i < block_size; i += stride) {
@@ -127,9 +121,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
}
// put result from shared mem to global mem
if constexpr (SMD == FULLSM)
for (int i = tid; i < block_size; i += stride)
dst_block[i] = result[i];
for (int i = tid; i < block_size; i += stride) {
dst_block[i] = result[i];
}
}
template <typename Torus, class params>
@@ -181,44 +175,39 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
}
template <typename Torus>
__host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
cudaStream_t stream, uint32_t gpu_index,
int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
cuda_stream_t *stream, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
int_radix_params params, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
stream, gpu_index, params, num_blocks_in_radix, max_num_radix_in_vec,
stream, params, num_blocks_in_radix, max_num_radix_in_vec,
allocate_gpu_memory);
}
template <typename Torus, class params>
__host__ void host_integer_sum_ciphertexts_vec_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *terms, int *terms_degree, void *bsk,
uint64_t *ksk, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
cuda_stream_t *stream, Torus *radix_lwe_out, Torus *terms,
int *terms_degree, void *bsk, uint64_t *ksk,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto new_blocks = mem_ptr->new_blocks;
auto old_blocks = mem_ptr->old_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto luts_message_carry = mem_ptr->luts_message_carry;
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
@@ -235,7 +224,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
num_blocks_in_radix * num_radix_in_vec *
big_lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
stream);
}
size_t r = num_radix_in_vec;
@@ -248,7 +237,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
int32_t h_smart_copy_in[r * num_blocks];
int32_t h_smart_copy_out[r * num_blocks];
auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
while (r > 2) {
size_t cur_total_blocks = r * num_blocks;
@@ -258,14 +247,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
if (sm_size < max_shared_memory)
tree_add_chunks<Torus, FULLSM><<<add_grid, 512, sm_size, streams[0]>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
else
tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, streams[0]>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
check_cuda_error(cudaGetLastError());
tree_add_chunks<Torus><<<add_grid, 512, sm_size, stream->stream>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
size_t total_count = 0;
size_t message_count = 0;
@@ -277,71 +260,36 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
total_count, message_count, carry_count, sm_copy_count);
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
auto luts_message_carry = new int_radix_lut<Torus>(
streams[0], gpu_indexes[0], mem_ptr->params, 2, total_count, true);
auto message_acc = luts_message_carry->get_lut(0);
auto carry_acc = luts_message_carry->get_lut(1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
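// Worked example (message_modulus = 4): a block whose cleartext is 7
// (carry 1, message 3) splits as lut_f_message(7) = 7 % 4 = 3 and
// lut_f_carry(7) = 7 / 4 = 1: exactly the message/carry extraction the two
// accumulators generated below perform homomorphically after the tree add.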
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
size_t copy_size = total_count * sizeof(Torus);
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size, stream);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size, stream);
copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
stream);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
streams[0], gpu_indexes[0]);
stream);
smart_copy<<<sm_copy_count, 256, 0, streams[0]>>>(
smart_copy<<<sm_copy_count, 256, 0, stream->stream>>>(
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
big_lwe_size);
check_cuda_error(cudaGetLastError());
if (carry_count > 0)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_message_carry->get_lut_indexes(message_count), 1, carry_count);
&(stream->stream), luts_message_carry->get_lut_indexes(message_count),
1, carry_count);
cuda_keyswitch_lwe_ciphertext_vector(
streams[0], gpu_indexes[0], small_lwe_vector, lwe_indexes_in,
new_blocks, lwe_indexes_in, ksk, polynomial_size * glwe_dimension,
lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
message_count);
stream, small_lwe_vector, lwe_indexes_in, new_blocks, lwe_indexes_in,
ksk, polynomial_size * glwe_dimension, lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
execute_pbs<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
lwe_indexes_out, luts_message_carry->lut,
luts_message_carry->lut_indexes, small_lwe_vector,
lwe_indexes_in, bsk, luts_message_carry->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0,
max_shared_memory, mem_ptr->params.pbs_type);
luts_message_carry->release(streams[0], gpu_indexes[0]);
execute_pbs<Torus>(
stream, new_blocks, lwe_indexes_out, luts_message_carry->lut,
luts_message_carry->lut_indexes, small_lwe_vector, lwe_indexes_in, bsk,
luts_message_carry->buffer, glwe_dimension, lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count,
2, 0, max_shared_memory, mem_ptr->params.pbs_type);
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_blocks;
@@ -349,29 +297,26 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, streams[0],
gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_blocks;
}
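// Term-count recurrence of the loop above:
//   r_next = 2 * ch_amount + (r > chunk_size ? r % chunk_size : 0),
// i.e. each processed chunk contributes a message radix and a carry radix while
// leftover terms are copied through unchanged, so r shrinks over the rounds
// until only two radixes remain for the final addition and carry propagation.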
host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
host_addition(stream, radix_lwe_out, old_blocks,
&old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
num_blocks);
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
radix_lwe_out, mem_ptr->scp_mem, bsk, ksk,
num_blocks);
host_propagate_single_carry<Torus>(stream, radix_lwe_out, mem_ptr->scp_mem,
bsk, ksk, num_blocks);
}
template <typename Torus, typename STorus, class params>
__host__ void host_integer_mult_radix_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
cuda_stream_t *stream, uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
@@ -438,14 +383,13 @@ __host__ void host_integer_mult_radix_kb(
dim3 grid(lsb_vector_block_count, 1, 1);
dim3 thds(params::degree / params::opt, 1, 1);
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
vector_lsb_rhs, vector_msb_rhs, num_blocks);
check_cuda_error(cudaGetLastError());
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res,
vector_result_sb, bsk, ksk, total_block_count, luts_array);
stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
total_block_count, luts_array);
vector_result_lsb = &block_mul_res[0];
vector_result_msb = &block_mul_res[lsb_vector_block_count *
@@ -453,10 +397,10 @@ __host__ void host_integer_mult_radix_kb(
fill_radix_from_lsb_msb<Torus, params>
<<<num_blocks * num_blocks, params::degree / params::opt, 0,
streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
glwe_dimension, lsb_vector_block_count,
msb_vector_block_count, num_blocks);
check_cuda_error(cudaGetLastError());
stream->stream>>>(vector_result_sb, vector_result_lsb,
vector_result_msb, glwe_dimension,
lsb_vector_block_count, msb_vector_block_count,
num_blocks);
int terms_degree[2 * num_blocks * num_blocks];
for (int i = 0; i < num_blocks * num_blocks; i++) {
@@ -472,35 +416,25 @@ __host__ void host_integer_mult_radix_kb(
}
host_integer_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb,
terms_degree, bsk, ksk, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks);
stream, radix_lwe_out, vector_result_sb, terms_degree, bsk, ksk,
mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
}
template <typename Torus>
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
cudaStream_t stream, uint32_t gpu_index, int_mul_memory<Torus> **mem_ptr,
cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
*mem_ptr = new int_mul_memory<Torus>(stream, gpu_index, params,
num_radix_blocks, allocate_gpu_memory);
*mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
allocate_gpu_memory);
}
#endif

View File

@@ -1,18 +1,18 @@
#include "integer/negation.cuh"
void cuda_negate_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cuda_stream_t *stream, void *lwe_array, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
host_integer_radix_negation(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_array),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
host_integer_radix_negation(stream, static_cast<uint64_t *>(lwe_array),
static_cast<uint64_t *>(lwe_array), lwe_dimension,
lwe_ciphertext_count, message_modulus,
carry_modulus);
}
void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -25,24 +25,21 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64(
message_modulus, carry_modulus);
scratch_cuda_integer_overflowing_sub_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
stream, (int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks,
params, allocate_gpu_memory);
}
void cuda_integer_radix_overflowing_sub_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left,
void *radix_lwe_right, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks) {
cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_overflowed,
void *radix_lwe_left, void *radix_lwe_right, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t num_blocks) {
auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
switch (mem->params.polynomial_size) {
case 512:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -50,8 +47,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 1024:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -59,8 +55,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 2048:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -68,8 +63,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 4096:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -77,8 +71,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 8192:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -86,8 +79,7 @@ void cuda_integer_radix_overflowing_sub_kb_64(
break;
case 16384:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
@@ -99,11 +91,10 @@ void cuda_integer_radix_overflowing_sub_kb_64(
}
}
void cleanup_cuda_integer_radix_overflowing_sub(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_overflowing_sub(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_overflowing_sub_memory<uint64_t> *mem_ptr =
(int_overflowing_sub_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

View File

@@ -58,13 +58,12 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
}
template <typename Torus>
__host__ void
host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *output, Torus *input,
uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count,
uint64_t message_modulus, uint64_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
__host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
Torus *input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count,
uint64_t message_modulus,
uint64_t carry_modulus) {
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
@@ -82,7 +81,7 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_negation<<<grid, thds, shared_mem, streams[0]>>>(
device_integer_radix_negation<<<grid, thds, shared_mem, stream->stream>>>(
output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
carry_modulus, delta);
check_cuda_error(cudaGetLastError());
@@ -90,33 +89,30 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
template <typename Torus>
__host__ void scratch_cuda_integer_overflowing_sub_kb(
cudaStream_t stream, uint32_t gpu_index,
int_overflowing_sub_memory<Torus> **mem_ptr, uint32_t num_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cuda_stream_t *stream, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
*mem_ptr = new int_overflowing_sub_memory<Torus>(
stream, gpu_index, params, num_blocks, allocate_gpu_memory);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_overflowing_sub_memory<Torus>(stream, params, num_blocks,
allocate_gpu_memory);
}
template <typename Torus, class params>
__host__ void host_integer_overflowing_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left,
Torus *radix_lwe_right, void *bsk, uint64_t *ksk,
cuda_stream_t *stream, Torus *radix_lwe_out, Torus *radix_lwe_overflowed,
Torus *radix_lwe_left, Torus *radix_lwe_right, void *bsk, uint64_t *ksk,
int_overflowing_sub_memory<uint64_t> *mem_ptr, uint32_t num_blocks) {
auto radix_params = mem_ptr->params;
host_unchecked_sub_with_correcting_term(
streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left,
radix_lwe_right, radix_params.big_lwe_dimension, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus,
radix_params.message_modulus - 1);
stream, radix_lwe_out, radix_lwe_left, radix_lwe_right,
radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
radix_params.carry_modulus, radix_params.message_modulus - 1);
host_propagate_single_sub_borrow<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_overflowed, radix_lwe_out,
mem_ptr->borrow_prop_mem, bsk, ksk, num_blocks);
stream, radix_lwe_overflowed, radix_lwe_out, mem_ptr->borrow_prop_mem,
bsk, ksk, num_blocks);
}
#endif

View File

@@ -1,12 +1,12 @@
#include "integer/scalar_addition.cuh"
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
cuda_stream_t *stream, void *lwe_array, void *scalar_input,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
host_integer_radix_scalar_addition_inplace(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
stream, static_cast<uint64_t *>(lwe_array),
static_cast<uint64_t *>(scalar_input), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}

View File

@@ -27,11 +27,10 @@ __global__ void device_integer_radix_scalar_addition_inplace(
template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(stream->gpu_index);
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -45,7 +44,8 @@ __host__ void host_integer_radix_scalar_addition_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_addition_inplace<<<grid, thds, 0, streams[0]>>>(
device_integer_radix_scalar_addition_inplace<<<grid, thds, 0,
stream->stream>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());
@@ -65,11 +65,10 @@ __global__ void device_integer_radix_add_scalar_one_inplace(
template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t lwe_dimension,
cuda_stream_t *stream, Torus *lwe_array, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -83,7 +82,8 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0, streams[0]>>>(
device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0,
stream->stream>>>(
lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
@@ -104,11 +104,10 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(
template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
cudaSetDevice(gpu_indexes[0]);
cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
cudaSetDevice(stream->gpu_index);
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
@@ -123,7 +122,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
streams[0]>>>(
stream->stream>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());

View File

@@ -1,14 +1,12 @@
#include "integer/scalar_bitops.cuh"
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_input, void *clear_blocks,
uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_input,
void *clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
host_integer_radix_scalar_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_input),
static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),

View File

@@ -6,12 +6,12 @@
template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_input, Torus *clear_blocks,
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks, BITOP_TYPE op) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_input,
Torus *clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t num_radix_blocks, BITOP_TYPE op) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto lut = mem_ptr->lut;
auto params = lut->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -21,30 +21,28 @@ __host__ void host_integer_radix_scalar_bitop_kb(
if (num_clear_blocks == 0) {
if (op == SCALAR_BITAND) {
cuda_memset_async(lwe_array_out, 0,
num_radix_blocks * lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
num_radix_blocks * lwe_size * sizeof(Torus), stream);
} else {
cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
num_radix_blocks * lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
stream);
}
} else {
// We have all possible LUTs pre-computed and we use the decomposed scalar
// as index to recover the right one
cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
num_clear_blocks * sizeof(Torus), stream);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsk,
ksk, num_clear_blocks, lut);
stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
lut);
if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
cuda_memset_async(lwe_array_out_block, 0,
(num_radix_blocks - num_clear_blocks) * lwe_size *
sizeof(Torus),
streams[0], gpu_indexes[0]);
stream);
}
}
}
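As the comments in this hunk say, every possible "block OP clear chunk" table is precomputed and the decomposed clear blocks are copied into lut_indexes so each ciphertext block selects its own table; for SCALAR_BITAND, any ciphertext blocks beyond the clear scalar are simply zeroed. A clear-value sketch of that indexing idea, with an ordinary lambda standing in for the precomputed LUTs (parameter values and names are illustrative):

#include <cstdint>
#include <vector>
#include <cassert>

int main() {
  const uint64_t message_modulus = 4;              // assumed 2-bit blocks
  std::vector<uint64_t> ct_blocks = {3, 1, 2, 3};  // stand-ins for radix blocks
  std::vector<uint64_t> clear_blocks = {1, 2};     // decomposed clear scalar (shorter)

  // One "LUT" per possible clear chunk; clear_blocks acts as the index vector.
  auto scalar_and_lut = [&](uint64_t block, uint64_t clear) {
    return (block & clear) % message_modulus;
  };

  std::vector<uint64_t> out(ct_blocks.size(), 0);
  for (size_t i = 0; i < clear_blocks.size(); ++i)
    out[i] = scalar_and_lut(ct_blocks[i], clear_blocks[i]);
  // SCALAR_BITAND: blocks beyond the clear scalar are forced to zero,
  // mirroring the cuda_memset_async on lwe_array_out_block above.
  for (size_t i = clear_blocks.size(); i < out.size(); ++i)
    out[i] = 0;

  assert(out == (std::vector<uint64_t>{1, 0, 0, 0}));
  return 0;
}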

View File

@@ -1,10 +1,9 @@
#include "integer/scalar_comparison.cuh"
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, void *scalar_blocks,
int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count,
uint32_t num_scalar_blocks) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *scalar_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -12,8 +11,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
case EQ:
case NE:
host_integer_radix_scalar_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
@@ -23,8 +21,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
case LT:
case LE:
host_integer_radix_scalar_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer,
buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
@@ -33,8 +30,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
case MAX:
case MIN:
host_integer_radix_scalar_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);

View File

@@ -6,13 +6,12 @@
template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -47,10 +46,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
if (total_num_scalar_blocks == 0) {
// We only have to compare blocks with zero
// means scalar is zero
host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
mem_ptr->tmp_lwe_array_out, lwe_array_in,
mem_ptr, bsk, ksk, total_num_radix_blocks,
mem_ptr->is_zero_lut);
host_compare_with_zero_equality(
stream, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsk, ksk,
total_num_radix_blocks, mem_ptr->is_zero_lut);
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -59,13 +57,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
};
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f);
generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, scalar_last_leaf_lut_f);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);
} else if (total_num_scalar_blocks < total_num_radix_blocks) {
// We have to handle both part of the work described above
@@ -79,7 +76,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(stream);
auto lsb_stream = mem_ptr->lsb_stream;
auto msb_stream = mem_ptr->msb_stream;
@@ -93,10 +90,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_stream, gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -108,15 +105,14 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1,
comparisons, lhs, rhs, mem_ptr, bsk, ksk,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
mem_ptr, bsk, ksk, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsk, ksk,
num_lsb_radix_blocks);
}
@@ -124,13 +120,13 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
{
//////////////
// msb
host_compare_with_zero_equality(
&msb_stream, &gpu_indexes[0], 1, lwe_array_msb_out, msb, mem_ptr,
bsk, ksk, num_msb_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
mem_ptr, bsk, ksk, num_msb_radix_blocks,
mem_ptr->is_zero_lut);
}
}
cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
cuda_synchronize_stream(lsb_stream);
cuda_synchronize_stream(msb_stream);
//////////////
// Reduce the two blocks into one final
@@ -145,12 +141,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, scalar_bivariate_last_leaf_lut_f);
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
lwe_array_msb_out, bsk, ksk, 1, lut);
stream, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsk, ksk,
1, lut);
} else {
// We only have to do the regular comparison
@@ -162,10 +158,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
pack_blocks(stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(stream, rhs, scalar_blocks, 0, num_scalar_blocks,
message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -176,28 +172,26 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
lhs, rhs, mem_ptr, bsk, ksk,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk,
ksk, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
sign_handler_f, bsk, ksk, num_lsb_radix_blocks);
tree_sign_reduction(stream, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, sign_handler_f, bsk,
ksk, num_lsb_radix_blocks);
}
}
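The unsigned difference check above splits the ciphertext at the scalar's width: the low blocks are compared against the scalar on lsb_stream, the high blocks are tested against zero on msb_stream, and a final bivariate LUT folds the two partial results into one sign. A clear-value sketch of that fold, using the 0/1/2 encoding for inferior/equal/superior; the combining rule is inferred from the comments (the actual last-leaf LUT body is elided by the hunk):

#include <cstdint>
#include <cassert>

// Sign encoding used throughout these files: 0 = lhs < rhs, 1 = equal, 2 = lhs > rhs.
enum Sign : uint64_t { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

// If any high block of the ciphertext is non-zero, lhs already exceeds the scalar;
// otherwise the decision comes from the low-block comparison.
uint64_t combine(bool msb_blocks_are_zero, uint64_t lsb_sign) {
  return msb_blocks_are_zero ? lsb_sign : IS_SUPERIOR;
}

int main() {
  assert(combine(true, IS_INFERIOR) == IS_INFERIOR);
  assert(combine(true, IS_EQUAL) == IS_EQUAL);
  assert(combine(false, IS_INFERIOR) == IS_SUPERIOR); // high blocks dominate
  return 0;
}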
template <typename Torus>
__host__ void integer_radix_signed_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -233,9 +227,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// We only have to compare blocks with zero
// means scalar is zero
Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
host_compare_with_zero_equality(
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
mem_ptr, bsk, ksk, total_num_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality(stream, are_all_msb_zeros, lwe_array_in,
mem_ptr, bsk, ksk, total_num_radix_blocks,
mem_ptr->is_zero_lut);
Torus *sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
@@ -276,12 +270,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, scalar_bivariate_last_leaf_lut_f);
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
sign_block, bsk, ksk, 1, lut);
stream, lwe_array_out, are_all_msb_zeros, sign_block, bsk, ksk, 1, lut);
} else if (total_num_scalar_blocks < total_num_radix_blocks) {
// We have to handle both part of the work described above
@@ -295,7 +288,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(stream);
auto lsb_stream = mem_ptr->lsb_stream;
auto msb_stream = mem_ptr->msb_stream;
@@ -309,10 +302,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_stream, gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -324,15 +317,14 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1,
comparisons, lhs, rhs, mem_ptr, bsk, ksk,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
mem_ptr, bsk, ksk, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsk, ksk,
num_lsb_radix_blocks);
}
@@ -342,9 +334,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// msb
// We remove the last block (which is the sign)
Torus *are_all_msb_zeros = lwe_array_msb_out;
host_compare_with_zero_equality(
&msb_stream, &gpu_indexes[0], 1, are_all_msb_zeros, msb, mem_ptr,
bsk, ksk, num_msb_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality(msb_stream, are_all_msb_zeros, msb,
mem_ptr, bsk, ksk, num_msb_radix_blocks,
mem_ptr->is_zero_lut);
auto sign_bit_pos = (int)log2(message_modulus) - 1;
@@ -372,23 +364,23 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto signed_msb_lut = mem_ptr->signed_msb_lut;
generate_device_accumulator_bivariate<Torus>(
msb_stream, gpu_indexes[0], signed_msb_lut->lut,
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f);
msb_stream, signed_msb_lut->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f);
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
integer_radix_apply_bivariate_lookup_table_kb(
&msb_stream, &gpu_indexes[0], 1, lwe_array_msb_out, sign_block,
are_all_msb_zeros, bsk, ksk, 1, signed_msb_lut);
msb_stream, lwe_array_msb_out, sign_block, are_all_msb_zeros, bsk,
ksk, 1, signed_msb_lut);
}
}
cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
cuda_synchronize_stream(lsb_stream);
cuda_synchronize_stream(msb_stream);
//////////////
// Reduce the two blocks into one final
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsk, ksk, 2);
reduce_signs(stream, lwe_array_out, lwe_array_lsb_out, mem_ptr,
sign_handler_f, bsk, ksk, 2);
} else {
// We only have to do the regular comparison
@@ -396,7 +388,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// total_num_radix_blocks == total_num_scalar_blocks
uint32_t num_lsb_radix_blocks = total_num_radix_blocks;
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(stream);
auto lsb_stream = mem_ptr->lsb_stream;
auto msb_stream = mem_ptr->msb_stream;
@@ -411,11 +403,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_stream, gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks - 1,
message_modulus);
pack_blocks(lsb_stream, gpu_indexes[0], rhs, scalar_blocks, 0,
pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
num_lsb_radix_blocks - 1, message_modulus);
pack_blocks(lsb_stream, rhs, scalar_blocks, 0, num_lsb_radix_blocks - 1,
message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -424,9 +415,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1,
lwe_array_ct_out, lhs, rhs, mem_ptr, bsk,
ksk, num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb(lsb_stream, lwe_array_ct_out, lhs, rhs,
mem_ptr, bsk, ksk, num_lsb_radix_blocks);
}
#pragma omp section
{
@@ -436,36 +426,34 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
scalar_blocks + (total_num_scalar_blocks - 1);
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
create_trivial_radix(msb_stream, gpu_indexes[0], trivial_sign_block,
scalar_sign_block, big_lwe_dimension, 1, 1,
message_modulus, carry_modulus);
create_trivial_radix(msb_stream, trivial_sign_block, scalar_sign_block,
big_lwe_dimension, 1, 1, message_modulus,
carry_modulus);
integer_radix_apply_bivariate_lookup_table_kb(
&msb_stream, &gpu_indexes[0], 1, lwe_array_sign_out,
encrypted_sign_block, trivial_sign_block, bsk, ksk, 1,
mem_ptr->signed_lut);
msb_stream, lwe_array_sign_out, encrypted_sign_block,
trivial_sign_block, bsk, ksk, 1, mem_ptr->signed_lut);
}
}
cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
cuda_synchronize_stream(lsb_stream);
cuda_synchronize_stream(msb_stream);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_ct_out, mem_ptr, sign_handler_f, bsk, ksk,
num_lsb_radix_blocks + 1);
reduce_signs(stream, lwe_array_out, lwe_array_ct_out, mem_ptr,
sign_handler_f, bsk, ksk, num_lsb_radix_blocks + 1);
}
}
template <typename Torus>
__host__ void integer_radix_signed_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
// Calculates the difference sign between the ciphertext and the scalar
// - 0 if lhs < rhs
@@ -473,8 +461,8 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
integer_radix_signed_scalar_difference_check_kb(
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
// There is no optimized CMUX for scalars, so we convert to a trivial
@@ -482,70 +470,66 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
auto lwe_array_left = lwe_array_in;
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
create_trivial_radix(stream, lwe_array_right, scalar_blocks,
params.big_lwe_dimension, total_num_radix_blocks,
total_num_scalar_blocks, params.message_modulus,
params.carry_modulus);
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, sign, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
host_integer_radix_cmux_kb(stream, lwe_array_out, sign, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
total_num_radix_blocks);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_difference_check_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, sign_handler_f, bsk, ksk,
total_num_radix_blocks, total_num_scalar_blocks);
stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr,
sign_handler_f, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_difference_check_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, sign_handler_f, bsk, ksk,
total_num_radix_blocks, total_num_scalar_blocks);
stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr,
sign_handler_f, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
}
}
template <typename Torus>
__host__ void host_integer_radix_signed_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_maxmin_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsk, ksk,
total_num_radix_blocks, total_num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_maxmin_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsk, ksk,
total_num_radix_blocks, total_num_scalar_blocks);
}
}
template <typename Torus>
__host__ void
scalar_compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -565,38 +549,37 @@ scalar_compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
subtracted_blocks, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace(
streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
stream, subtracted_blocks, scalar_blocks, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsk,
ksk, num_radix_blocks, sign_lut);
integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
subtracted_blocks, bsk, ksk,
num_radix_blocks, sign_lut);
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
big_lwe_dimension, num_radix_blocks,
message_modulus, carry_modulus);
}
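scalar_compare_radix_blocks_kb above reduces each block comparison to three steps: subtract the scalar block, map the difference to a sign with a lookup, then add one so the result lands in {0, 1, 2}. The comment's worked arithmetic ((-1) % 16 = 15, then +1 wraps to 0) in plain integers, assuming message_modulus = carry_modulus = 4 (illustrative):

#include <cstdint>
#include <cassert>

int main() {
  const uint64_t plaintext_modulus = 4 * 4; // message_modulus * carry_modulus, assumed

  // Sign of (lhs_block - rhs_block), encoded modulo the plaintext space:
  // "less" is -1, which wraps to 15 = 1|1111 as in the comment above.
  auto signed_sign = [&](int64_t diff) -> uint64_t {
    int64_t s = (diff > 0) - (diff < 0);     // -1, 0 or 1
    return uint64_t(s) % plaintext_modulus;  // wraps -1 to 15
  };

  // Adding one brings the encoding to 0 (inferior), 1 (equal), 2 (superior).
  auto final_sign = [&](int64_t diff) {
    return (signed_sign(diff) + 1) % plaintext_modulus;
  };

  assert(final_sign(-3) == 0);
  assert(final_sign(0) == 1);
  assert(final_sign(+2) == 2);
  return 0;
}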
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
// Calculates the difference sign between the ciphertext and the scalar
@@ -605,8 +588,8 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
host_integer_radix_scalar_difference_check_kb(
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
total_num_scalar_blocks);
// There is no optimized CMUX for scalars, so we convert to a trivial
@@ -614,27 +597,24 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
auto lwe_array_left = lwe_array_in;
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
create_trivial_radix(stream, lwe_array_right, scalar_blocks,
params.big_lwe_dimension, total_num_radix_blocks,
total_num_scalar_blocks, params.message_modulus,
params.carry_modulus);
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
total_num_radix_blocks);
host_integer_radix_cmux_kb(
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_equality_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -661,7 +641,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
auto lwe_array_msb_out =
lwe_array_lsb_out + big_lwe_size * num_halved_lsb_radix_blocks;
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_synchronize_stream(stream);
auto lsb_stream = mem_ptr->lsb_stream;
auto msb_stream = mem_ptr->msb_stream;
@@ -676,19 +656,18 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
auto packed_scalar =
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
pack_blocks(lsb_stream, gpu_indexes[0], packed_blocks, lsb,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, gpu_indexes[0], packed_scalar, scalar_blocks, 0,
pack_blocks(lsb_stream, packed_blocks, lsb, big_lwe_dimension,
num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_stream, packed_scalar, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
cuda_memcpy_async_gpu_to_gpu(scalar_comparison_luts->lut_indexes,
packed_scalar,
num_halved_scalar_blocks * sizeof(Torus),
lsb_stream, gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
scalar_comparison_luts->lut_indexes, packed_scalar,
num_halved_scalar_blocks * sizeof(Torus), lsb_stream);
integer_radix_apply_univariate_lookup_table_kb(
&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out, packed_blocks,
bsk, ksk, num_halved_lsb_radix_blocks, scalar_comparison_luts);
lsb_stream, lwe_array_lsb_out, packed_blocks, bsk, ksk,
num_halved_lsb_radix_blocks, scalar_comparison_luts);
}
}
#pragma omp section
@@ -708,27 +687,25 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
PANIC("Cuda error: integer operation not supported")
}
host_compare_with_zero_equality(&msb_stream, &gpu_indexes[0], 1,
lwe_array_msb_out, msb, mem_ptr, bsk,
ksk, num_msb_radix_blocks, msb_lut);
host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
mem_ptr, bsk, ksk, num_msb_radix_blocks,
msb_lut);
}
}
}
cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
cuda_synchronize_stream(lsb_stream);
cuda_synchronize_stream(msb_stream);
switch (mem_ptr->op) {
case COMPARISON_TYPE::EQ:
are_all_comparisons_block_true(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
mem_ptr, bsk, ksk,
stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
case COMPARISON_TYPE::NE:
is_at_least_one_comparisons_block_true(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
mem_ptr, bsk, ksk,
stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
default:
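For the equality check above, adjacent ciphertext blocks are packed two by two, the packed scalar chunks are copied into lut_indexes so each packed block selects its own equality LUT, any ciphertext blocks beyond the scalar are tested against zero, and EQ/NE then reduce the partial bits with are_all_comparisons_block_true / is_at_least_one_comparisons_block_true. A clear-value sketch of that pipeline; the packing orientation (high block scaled by message_modulus) is my assumption, since pack_blocks' body is not visible in the diff:

#include <cstdint>
#include <vector>
#include <cassert>

int main() {
  const uint64_t message_modulus = 4; // assumed 2-bit blocks

  // Assumed packing: two adjacent blocks become high * message_modulus + low,
  // halving the number of blocks that go through the PBS.
  auto pack = [&](uint64_t low, uint64_t high) { return high * message_modulus + low; };

  std::vector<uint64_t> ct = {1, 3, 0, 2};      // radix blocks, LSB first
  std::vector<uint64_t> scalar = {1, 3, 0, 2};  // decomposed clear scalar

  // One equality bit per packed pair; EQ needs every bit to be true,
  // NE needs at least one pair to differ.
  bool all_equal = true;
  for (size_t i = 0; i + 1 < ct.size(); i += 2)
    all_equal &= pack(ct[i], ct[i + 1]) == pack(scalar[i], scalar[i + 1]);

  assert(all_equal);
  return 0;
}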

View File

@@ -1,7 +1,7 @@
#include "integer/scalar_mul.cuh"
void scratch_cuda_integer_scalar_mul_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -13,22 +13,20 @@ void scratch_cuda_integer_scalar_mul_kb_64(
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
stream, (int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
}
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set, int8_t *mem,
void *bsk, void *ksk, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) {
cuda_stream_t *stream, void *lwe_array, uint64_t *decomposed_scalar,
uint64_t *has_at_least_one_set, int8_t *mem, void *bsk, void *ksk,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
uint32_t num_blocks, uint32_t num_scalars) {
switch (polynomial_size) {
case 512:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -36,8 +34,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 1024:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -45,8 +42,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 2048:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -54,8 +50,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 4096:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -63,8 +58,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 8192:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -72,8 +66,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
break;
case 16384:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), decomposed_scalar,
stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
@@ -85,12 +78,12 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
}
}
void cleanup_cuda_integer_radix_scalar_mul(void *stream, uint32_t gpu_index,
void cleanup_cuda_integer_radix_scalar_mul(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int_scalar_mul_buffer<uint64_t> *mem_ptr =
(int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

View File

@@ -29,43 +29,33 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
template <typename T>
__host__ void scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t stream, uint32_t gpu_index, int_scalar_mul_buffer<T> **mem_ptr,
cuda_stream_t *stream, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<T>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<T>, cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
*mem_ptr = new int_scalar_mul_buffer<T>(
stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory);
*mem_ptr = new int_scalar_mul_buffer<T>(stream, params, num_radix_blocks,
allocate_gpu_memory);
}
template <typename T, class params>
__host__ void host_integer_scalar_mul_radix(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
T *lwe_array, T *decomposed_scalar, T *has_at_least_one_set,
int_scalar_mul_buffer<T> *mem, void *bsk, T *ksk,
cuda_stream_t *stream, T *lwe_array, T *decomposed_scalar,
T *has_at_least_one_set, int_scalar_mul_buffer<T> *mem, void *bsk, T *ksk,
uint32_t input_lwe_dimension, uint32_t message_modulus,
uint32_t num_radix_blocks, uint32_t num_scalars) {
  if (num_radix_blocks == 0 || num_scalars == 0)
  if (num_radix_blocks == 0 || num_scalars == 0)
return;
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
uint32_t lwe_size = input_lwe_dimension + 1;
@@ -80,15 +70,13 @@ __host__ void host_integer_scalar_mul_radix(
T *ptr = preshifted_buffer + shift_amount * lwe_size * num_radix_blocks;
if (has_at_least_one_set[shift_amount] == 1) {
cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
lwe_size_bytes * num_radix_blocks,
streams[0], gpu_indexes[0]);
lwe_size_bytes * num_radix_blocks, stream);
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, gpu_indexes, gpu_count, ptr, shift_amount,
mem->logical_scalar_shift_buffer, bsk, ksk, num_radix_blocks);
stream, ptr, shift_amount, mem->logical_scalar_shift_buffer, bsk, ksk,
num_radix_blocks);
} else {
// create trivial assign for value = 0
cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, streams[0],
gpu_indexes[0]);
cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, stream);
}
}
size_t j = 0;
@@ -99,40 +87,37 @@ __host__ void host_integer_scalar_mul_radix(
preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
T *block_shift_buffer =
all_shifted_buffer + j * num_radix_blocks * lwe_size;
radix_blocks_rotate_right<<<num_radix_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_radix_blocks, 256, 0, stream->stream>>>(
block_shift_buffer, preshifted_radix_ct, i / msg_bits,
num_radix_blocks, lwe_size);
// create trivial assign for value = 0
cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);
j++;
}
}
if (j == 0) {
// lwe array = 0
cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes,
streams[0], gpu_indexes[0]);
cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes, stream);
} else {
int terms_degree[j * num_radix_blocks];
for (int i = 0; i < j * num_radix_blocks; i++) {
terms_degree[i] = message_modulus - 1;
}
host_integer_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
terms_degree, bsk, ksk, mem->sum_ciphertexts_vec_mem, num_radix_blocks,
j);
stream, lwe_array, all_shifted_buffer, terms_degree, bsk, ksk,
mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);
}
}
// Small scalar_mul is used in shift/rotate
template <typename T>
__host__ void host_integer_small_scalar_mul_radix(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
T *output_lwe_array, T *input_lwe_array, T scalar,
cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -143,7 +128,7 @@ __host__ void host_integer_small_scalar_mul_radix(
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
device_small_scalar_radix_multiplication<<<grid, thds, 0, streams[0]>>>(
device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
input_lwe_ciphertext_count);
check_cuda_error(cudaGetLastError());
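host_integer_scalar_mul_radix above is plain shift-and-add carried out on ciphertexts: for each bit position that is set in at least one scalar digit (has_at_least_one_set), the input is pre-shifted within blocks, rotated by whole blocks, and all surviving terms are summed; when no bit is set (j == 0) the output is just zeroed. The clear-value analogue, a minimal sketch with illustrative values:

#include <cstdint>
#include <cassert>

// Clear-value analogue of the shift-and-add scalar multiplication above:
// x * scalar = sum over set bits b of (x << b), where each shift is split into
// an intra-block shift (b % msg_bits) and a whole-block rotation (b / msg_bits).
uint64_t scalar_mul(uint64_t x, uint64_t scalar) {
  uint64_t acc = 0;
  for (uint32_t b = 0; b < 64; ++b)
    if ((scalar >> b) & 1)
      acc += x << b; // on the GPU this term is a shifted copy of the radix ciphertext
  return acc;
}

int main() {
  assert(scalar_mul(13, 0) == 0);  // the j == 0 path: output is simply zeroed
  assert(scalar_mul(13, 6) == 78);
  return 0;
}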

View File

@@ -1,7 +1,7 @@
#include "scalar_rotate.cuh"
void scratch_cuda_integer_radix_scalar_rotate_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -15,28 +15,27 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
message_modulus, carry_modulus);
scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
stream, (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
}
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t n, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,
void *lwe_array, uint32_t n,
int8_t *mem_ptr, void *bsk,
void *ksk,
uint32_t num_blocks) {
host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), n,
stream, static_cast<uint64_t *>(lwe_array), n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}
void cleanup_cuda_integer_radix_scalar_rotate(void *stream, uint32_t gpu_index,
void cleanup_cuda_integer_radix_scalar_rotate(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
cudaSetDevice(gpu_index);
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

View File

@@ -13,25 +13,22 @@
template <typename Torus>
__host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
cudaStream_t stream, uint32_t gpu_index,
int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory) {
cuda_stream_t *stream, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
stream, gpu_index, shift_type, params, num_radix_blocks,
allocate_gpu_memory);
stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t n, int_logical_scalar_shift_buffer<Torus> *mem,
void *bsk, Torus *ksk, uint32_t num_blocks) {
cudaSetDevice(gpu_indexes[0]);
cuda_stream_t *stream, Torus *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
uint32_t num_blocks) {
cudaSetDevice(stream->gpu_index);
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -60,12 +57,11 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
// block_count blocks will be used in the grid
// one block is responsible to process single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);
if (shift_within_block == 0) {
return;
@@ -73,21 +69,20 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsk, ksk, num_blocks, lut_bivariate);
stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
lut_bivariate);
} else {
// left shift
radix_blocks_rotate_left<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);
if (shift_within_block == 0) {
return;
@@ -95,12 +90,12 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
radix_blocks_rotate_left<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsk, ksk, num_blocks, lut_bivariate);
stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
lut_bivariate);
}
}
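The scalar rotate above first rotates whole blocks (radix_blocks_rotate_right/left), then handles the leftover shift_within_block by pairing each receiver block with its giver neighbour through one bivariate PBS, returning early when that leftover is zero. A clear-value sketch of the same decomposition, assuming 2-bit blocks and a 4-block integer (illustrative sizes):

#include <cstdint>
#include <cassert>

int main() {
  const uint32_t msg_bits = 2, num_blocks = 4;
  const uint32_t total_bits = msg_bits * num_blocks;

  uint32_t n = 5;                              // rotate amount
  uint32_t rotations = n / msg_bits;           // whole-block part: 2 blocks
  uint32_t shift_within_block = n % msg_bits;  // leftover: 1 bit, done by the bivariate LUT

  uint8_t x = 0b10110100;                      // 4 blocks of 2 bits
  // Reference: rotate left by n over total_bits.
  uint8_t expected = uint8_t(((x << n) | (x >> (total_bits - n))) & 0xFF);

  // Step 1: rotate by whole blocks, step 2: rotate by the remaining bit count.
  uint32_t s1 = rotations * msg_bits;
  uint8_t after_blocks = uint8_t(((x << s1) | (x >> (total_bits - s1))) & 0xFF);
  uint8_t after_bits =
      uint8_t(((after_blocks << shift_within_block) |
               (after_blocks >> (total_bits - shift_within_block))) & 0xFF);

  assert(after_bits == expected);
  assert(shift_within_block != 0); // when it is 0 the host function returns after the copy
  return 0;
}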

View File

@@ -1,7 +1,7 @@
#include "scalar_shifts.cuh"
void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -15,9 +15,8 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
message_modulus, carry_modulus);
scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
stream, (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
}
/// The logical scalar shift is the one used for unsigned integers, and
@@ -25,19 +24,17 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
/// the application of a PBS onto the rotated blocks up to num_blocks -
/// rotations - 1 The remaining blocks are padded with zeros
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks) {
cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
void *bsk, void *ksk, uint32_t num_blocks) {
host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), shift,
stream, static_cast<uint64_t *>(lwe_array), shift,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}
void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -51,9 +48,8 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
message_modulus, carry_modulus);
scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
stream, (int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr,
num_blocks, params, shift_type, allocate_gpu_memory);
}
/// The arithmetic scalar shift is the one used for the signed right shift.
@@ -64,35 +60,31 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// block, which is copied onto all remaining blocks instead of padding with
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks) {
cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
void *bsk, void *ksk, uint32_t num_blocks) {
host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), shift,
stream, static_cast<uint64_t *>(lwe_array), shift,
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}
void cleanup_cuda_integer_radix_logical_scalar_shift(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_logical_scalar_shift(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}
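As the doc comments in this file state, the two shift entry points differ only in what fills the vacated blocks: the logical shift pads with trivial zeros, while the arithmetic right shift rebuilds a padding block from the encrypted sign with an extra PBS and copies it across the top blocks. A clear-value sketch of the two behaviours for a right shift, assuming 8-bit two's-complement values (illustrative width):

#include <cstdint>
#include <cassert>

// Logical right shift: vacated high bits are filled with zeros.
uint8_t logical_shr(uint8_t x, uint32_t s) { return uint8_t(x >> s); }

// Arithmetic right shift: vacated high bits are filled with copies of the sign bit,
// which is what the extra "padding block" PBS reproduces block by block on the GPU.
int8_t arithmetic_shr(int8_t x, uint32_t s) { return int8_t(x >> s); }

int main() {
  assert(logical_shr(0b10010000, 3) == 0b00010010);
  assert(arithmetic_shr(int8_t(0b10010000), 3) == int8_t(0b11110010));
  return 0;
}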

View File

@@ -1,5 +1,5 @@
#ifndef CUDA_INTEGER_SCALAR_SHIFT_CUH
#define CUDA_INTEGER_SCALAR_SHIFT_CUH
#ifndef CUDA_INTEGER_SCALAR_SHIFT_OPS_CUH
#define CUDA_INTEGER_SCALAR_SHIFT_OPS_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
@@ -14,26 +14,22 @@
template <typename Torus>
__host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
cudaStream_t stream, uint32_t gpu_index,
int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory) {
cuda_stream_t *stream, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
stream, gpu_index, shift_type, params, num_radix_blocks,
allocate_gpu_memory);
stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
}
template <typename Torus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t shift,
cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
uint32_t num_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -63,15 +59,14 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
// block_count blocks will be used in the grid
// one block is responsible to process single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
radix_blocks_rotate_right<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
// create trivial assign for value = 0
cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);
if (shift_within_block == 0 || rotations == num_blocks) {
return;
@@ -84,23 +79,20 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, partial_previous_blocks, bsk, ksk,
partial_block_count, lut_bivariate);
stream, partial_current_blocks, partial_current_blocks,
partial_previous_blocks, bsk, ksk, partial_block_count, lut_bivariate);
} else {
// right shift
radix_blocks_rotate_left<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
// rotate left as the blocks are from LSB to MSB
// create trivial assign for value = 0
cuda_memset_async(rotated_buffer + (num_blocks - rotations) * big_lwe_size,
0, rotations * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
0, rotations * big_lwe_size_bytes, stream);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);
if (shift_within_block == 0 || rotations == num_blocks) {
return;
@@ -112,34 +104,29 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, partial_next_blocks, bsk, ksk,
partial_block_count, lut_bivariate);
stream, partial_current_blocks, partial_current_blocks,
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
}
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
cudaStream_t stream, uint32_t gpu_index,
int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
cuda_stream_t *stream, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
stream, gpu_index, shift_type, params, num_radix_blocks,
allocate_gpu_memory);
stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
}
template <typename Torus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, uint32_t shift,
cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
uint32_t num_blocks) {
cudaSetDevice(gpu_indexes[0]);
cudaSetDevice(stream->gpu_index);
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -169,11 +156,10 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
if (mem->shift_type == RIGHT_SHIFT) {
radix_blocks_rotate_left<<<num_blocks, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
num_blocks * big_lwe_size_bytes, stream);
if (num_bits_in_block == 1) {
// if there is only 1 bit in the msg part, it means shift_within_block is
@@ -189,7 +175,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
rotated_buffer + (num_blocks - rotations) * big_lwe_size;
for (uint i = 0; i < num_blocks; i++) {
cuda_memcpy_async_gpu_to_gpu(block_dest, block_src, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);
block_dest += big_lwe_size;
}
return;
@@ -199,49 +185,47 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
// bit. This creates the need for a different shifting lut than in the
// logical shift case. We also need another PBS to create the padding block.
Torus *last_block = lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(
last_block_copy,
rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(last_block_copy,
rotated_buffer + (num_blocks - rotations - 1) *
big_lwe_size,
big_lwe_size_bytes, stream);
auto partial_current_blocks = lwe_array;
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
size_t partial_block_count = num_blocks - rotations;
if (shift_within_block != 0 && rotations != num_blocks) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, partial_next_blocks, bsk, ksk,
partial_block_count, lut_bivariate);
stream, partial_current_blocks, partial_current_blocks,
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
}
// Since our CPU threads will be working on different streams we must first
// make sure the work queued on the main stream has completed
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
stream->synchronize();
#pragma omp parallel sections
{
// All sections may be executed in parallel
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
&mem->local_stream_1, &gpu_indexes[0], 1, padding_block,
last_block_copy, bsk, ksk, 1, lut_univariate_padding_block);
mem->local_stream_1, padding_block, last_block_copy, bsk, ksk, 1,
lut_univariate_padding_block);
// Replace blocks 'pulled' from the left with the correct padding block
for (uint i = 0; i < rotations; i++) {
cuda_memcpy_async_gpu_to_gpu(
lwe_array + (num_blocks - rotations + i) * big_lwe_size,
padding_block, big_lwe_size_bytes, mem->local_stream_1,
gpu_indexes[0]);
padding_block, big_lwe_size_bytes, mem->local_stream_1);
}
}
#pragma omp section
{
if (shift_within_block != 0 && rotations != num_blocks) {
integer_radix_apply_univariate_lookup_table_kb(
&mem->local_stream_2, &gpu_indexes[0], 1, last_block,
last_block_copy, bsk, ksk, 1, lut_univariate_shift_last_block);
mem->local_stream_2, last_block, last_block_copy, bsk, ksk, 1,
lut_univariate_shift_last_block);
}
}
}
cuda_synchronize_stream(mem->local_stream_1, gpu_indexes[0]);
cuda_synchronize_stream(mem->local_stream_2, gpu_indexes[0]);
cuda_synchronize_stream(mem->local_stream_1);
cuda_synchronize_stream(mem->local_stream_2);
} else {
PANIC("Cuda error (scalar shift): left scalar shift is never of the "
@@ -249,4 +233,4 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
}
}
#endif // CUDA_SCALAR_SHIFT_CUH
#endif // CUDA_SCALAR_OPS_CUH
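
The logical scalar shift above works in two stages: a whole-block rotation with zero fill, followed by one bivariate PBS per remaining block to handle the intra-block part of the shift. A minimal cleartext analogue of that strategy (function and variable names here are illustrative, not library code; blocks are assumed LSB-first with bits_per_block message bits each):

#include <cstdint>
#include <vector>

// Cleartext analogue of the block-rotation + intra-block-combine strategy
// used by the logical left shift above (illustrative only).
std::vector<uint64_t>
logical_shift_left_blocks(const std::vector<uint64_t> &blocks, uint32_t shift,
                          uint32_t bits_per_block) {
  const uint32_t num_blocks = blocks.size();
  const uint64_t msg_mask = (1ULL << bits_per_block) - 1;
  const uint32_t rotations = shift / bits_per_block;          // whole-block part
  const uint32_t shift_within_block = shift % bits_per_block; // intra-block part

  // Step 1: rotate blocks towards the MSB and zero the vacated low blocks,
  // mirroring radix_blocks_rotate_right + cuda_memset_async.
  std::vector<uint64_t> rotated(num_blocks, 0);
  for (uint32_t i = rotations; i < num_blocks; i++)
    rotated[i] = blocks[i - rotations];

  if (shift_within_block == 0 || rotations == num_blocks)
    return rotated;

  // Step 2: per-block combine with the previous block, the role played by the
  // bivariate LUT on ciphertexts.
  std::vector<uint64_t> out = rotated;
  for (uint32_t i = rotations; i < num_blocks; i++) {
    const uint64_t prev = (i == 0) ? 0 : rotated[i - 1];
    out[i] = ((rotated[i] << shift_within_block) |
              (prev >> (bits_per_block - shift_within_block))) &
             msg_mask;
  }
  return out;
}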

View File

@@ -1,7 +1,7 @@
#include "shift_and_rotate.cuh"
void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -15,29 +15,26 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
message_modulus, carry_modulus);
scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
(int_shift_and_rotate_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, is_signed, allocate_gpu_memory);
stream, (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, is_signed, allocate_gpu_memory);
}
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *lwe_shift, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks) {
cuda_stream_t *stream, void *lwe_array, void *lwe_shift, int8_t *mem_ptr,
void *bsk, void *ksk, uint32_t num_blocks) {
host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_shift),
stream, static_cast<uint64_t *>(lwe_array),
static_cast<uint64_t *>(lwe_shift),
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}
void cleanup_cuda_integer_radix_shift_and_rotate(void *stream,
uint32_t gpu_index,
void cleanup_cuda_integer_radix_shift_and_rotate(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
cudaSetDevice(gpu_index);
int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
(int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
mem_ptr->release(stream);
}

View File

@@ -14,37 +14,33 @@
template <typename Torus>
__host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(
cudaStream_t stream, uint32_t gpu_index,
int_shift_and_rotate_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cuda_stream_t *stream, int_shift_and_rotate_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_shift_and_rotate_buffer<Torus>(
stream, gpu_index, shift_type, is_signed, params, num_radix_blocks,
stream, shift_type, is_signed, params, num_radix_blocks,
allocate_gpu_memory);
}
template <typename Torus>
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array, Torus *lwe_shift, int_shift_and_rotate_buffer<Torus> *mem,
void *bsk, Torus *ksk, uint32_t num_radix_blocks) {
cuda_stream_t *stream, Torus *lwe_array, Torus *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *bsk, Torus *ksk,
uint32_t num_radix_blocks) {
uint32_t bits_per_block = std::log2(mem->params.message_modulus);
uint32_t total_nb_bits = bits_per_block * num_radix_blocks;
if (total_nb_bits == 0)
return;
auto big_lwe_dimension = mem->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
cudaSetDevice(gpu_indexes[0]);
assert(total_nb_bits > 0);
// Extract all bits
auto bits = mem->tmp_bits;
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsk,
ksk, num_radix_blocks, bits_per_block,
mem->bit_extract_luts);
extract_n_bits<Torus>(stream, bits, lwe_array, bsk, ksk, num_radix_blocks,
bits_per_block, mem->bit_extract_luts);
// Extract shift bits
auto shift_bits = mem->tmp_shift_bits;
@@ -63,8 +59,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Extract the bits and put them at bit index 2 (=> bit number 3)
// so that they are already aligned to the correct position of the cmux input
// and we reduce noise growth
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
bsk, ksk, 1, max_num_bits_that_tell_shift,
extract_n_bits<Torus>(stream, shift_bits, lwe_shift, bsk, ksk, 1,
max_num_bits_that_tell_shift,
mem->bit_extract_luts_with_offset_2);
// If signed, do an "arithmetic shift" by padding with the sign bit
@@ -78,50 +74,47 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
auto mux_inputs = mem->tmp_mux_inputs;
cuda_memcpy_async_gpu_to_gpu(input_bits_a, bits,
total_nb_bits * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
total_nb_bits * big_lwe_size_bytes, stream);
for (int d = 0; d < max_num_bits_that_tell_shift; d++) {
auto shift_bit = shift_bits + d * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(input_bits_b, input_bits_a,
total_nb_bits * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
total_nb_bits * big_lwe_size_bytes, stream);
auto rotations = 1 << d;
switch (mem->shift_type) {
case LEFT_SHIFT:
radix_blocks_rotate_right<<<total_nb_bits, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<total_nb_bits, 256, 0, stream->stream>>>(
rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
for (int i = 0; i < rotations; i++)
cuda_memcpy_async_gpu_to_gpu(rotated_input + i * big_lwe_size,
last_bit, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
last_bit, big_lwe_size_bytes, stream);
else
cuda_memset_async(rotated_input, 0, rotations * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);
break;
case RIGHT_SHIFT:
radix_blocks_rotate_left<<<total_nb_bits, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<total_nb_bits, 256, 0, stream->stream>>>(
rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
if (mem->is_signed)
for (int i = 0; i < rotations; i++)
cuda_memcpy_async_gpu_to_gpu(
rotated_input + (total_nb_bits - rotations + i) * big_lwe_size,
last_bit, big_lwe_size_bytes, streams[0], gpu_indexes[0]);
last_bit, big_lwe_size_bytes, stream);
else
cuda_memset_async(
rotated_input + (total_nb_bits - rotations) * big_lwe_size, 0,
rotations * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
cuda_memset_async(rotated_input +
(total_nb_bits - rotations) * big_lwe_size,
0, rotations * big_lwe_size_bytes, stream);
break;
case LEFT_ROTATE:
radix_blocks_rotate_right<<<total_nb_bits, 256, 0, streams[0]>>>(
radix_blocks_rotate_right<<<total_nb_bits, 256, 0, stream->stream>>>(
rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
break;
case RIGHT_ROTATE:
radix_blocks_rotate_left<<<total_nb_bits, 256, 0, streams[0]>>>(
radix_blocks_rotate_left<<<total_nb_bits, 256, 0, stream->stream>>>(
rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
break;
default:
@@ -131,23 +124,21 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// pack bits into one block so that we have
// control_bit|b|a
cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
streams[0], gpu_indexes[0]); // Do we need this?
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs,
mux_lut->lwe_indexes_out, rotated_input, input_bits_a,
mux_lut->lwe_indexes_in, big_lwe_dimension, 2,
total_nb_bits);
stream); // Do we need this?
pack_bivariate_blocks(stream, mux_inputs, mux_lut->lwe_indexes_out,
rotated_input, input_bits_a, mux_lut->lwe_indexes_in,
big_lwe_dimension, 2, total_nb_bits);
// The shift bit is already properly aligned/positioned
for (int i = 0; i < total_nb_bits; i++)
host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size,
host_addition(stream, mux_inputs + i * big_lwe_size,
mux_inputs + i * big_lwe_size, shift_bit,
mem->params.big_lwe_dimension, 1);
// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsk, ksk,
total_nb_bits, mux_lut);
stream, input_bits_a, mux_inputs, bsk, ksk, total_nb_bits, mux_lut);
}
// Initializes the output
@@ -156,7 +147,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
last_bit = input_bits_a + (bits_per_block - 1) * big_lwe_size;
for (int i = 0; i < num_radix_blocks; i++) {
cuda_memcpy_async_gpu_to_gpu(lwe_last_out, last_bit, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
stream);
lwe_last_out += big_lwe_size;
last_bit += bits_per_block * big_lwe_size;
@@ -167,15 +158,14 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
for (int i = bits_per_block - 2; i >= 0; i--) {
host_integer_small_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, 2,
big_lwe_dimension, num_radix_blocks);
stream, lwe_last_out, lwe_last_out, 2, big_lwe_dimension,
num_radix_blocks);
auto block = lwe_last_out;
auto bit_to_add = input_bits_a + i * big_lwe_size;
for (int j = 0; j < num_radix_blocks; j++) {
host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add,
big_lwe_dimension, 1);
host_addition(stream, block, block, bit_to_add, big_lwe_dimension, 1);
block += big_lwe_size;
bit_to_add += bits_per_block * big_lwe_size;
@@ -184,8 +174,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsk, ksk,
num_radix_blocks, cleaning_lut);
stream, lwe_last_out, lwe_last_out, bsk, ksk, num_radix_blocks,
cleaning_lut);
}
}
#endif
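
The shift-and-rotate routine above is a barrel shifter over the extracted bits: one CMUX stage per bit of the encrypted shift amount, each stage selecting between the current bit vector and the same vector rotated by 1 << d. A cleartext sketch of that control flow for the LEFT_SHIFT case (illustrative only; assumes an LSB-first bit layout):

#include <cstdint>
#include <vector>

// Cleartext barrel shifter mirroring the homomorphic loop above.
std::vector<int> barrel_shift_left(std::vector<int> bits, uint32_t shift,
                                   uint32_t max_num_bits_that_tell_shift) {
  const size_t n = bits.size();
  for (uint32_t d = 0; d < max_num_bits_that_tell_shift; d++) {
    const uint32_t rotations = 1u << d;
    const int shift_bit = (shift >> d) & 1; // homomorphically: an encrypted bit
    // LEFT_SHIFT: bits move towards the MSB, vacated slots are zero-filled
    std::vector<int> rotated(n, 0);
    for (size_t i = rotations; i < n; i++)
      rotated[i] = bits[i - rotations];
    // CMUX: keep the rotated version iff this shift bit is set
    for (size_t i = 0; i < n; i++)
      bits[i] = shift_bit ? rotated[i] : bits[i];
  }
  return bits;
}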

View File

@@ -4,15 +4,14 @@
* Perform the addition of two u32 input LWE ciphertext vectors.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
host_addition(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
@@ -44,15 +43,14 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
* vectors are left unchanged. This function is a wrapper to a device function
* that performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
host_addition(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
@@ -62,12 +60,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
* plaintext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
host_addition_plaintext(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
@@ -101,12 +98,11 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
host_addition_plaintext(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
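
All of these entry points operate on contiguous vectors of LWE ciphertexts, each ciphertext holding input_lwe_dimension mask coefficients plus one body. A small helper showing the expected buffer size for the u64 variants (a sketch under that layout assumption, not library code):

#include <cstddef>
#include <cstdint>

// Total byte size of a vector of `count` LWE ciphertexts for the u64 entry
// points above: each ciphertext is input_lwe_dimension mask words plus one
// body word.
inline size_t lwe_vector_size_bytes_u64(uint32_t input_lwe_dimension,
                                        uint32_t count) {
  const size_t lwe_size = static_cast<size_t>(input_lwe_dimension) + 1;
  return lwe_size * count * sizeof(uint64_t);
}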

View File

@@ -6,10 +6,9 @@
#include <cuda_runtime.h>
#endif
#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "helper.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
template <typename T>
@@ -28,22 +27,21 @@ __global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
}
template <typename T>
__host__ void
host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
T *lwe_input, T *plaintext_input,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {
__host__ void host_addition_plaintext(cuda_stream_t *stream, T *output,
T *lwe_input, T *plaintext_input,
uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
(lwe_dimension + 1) * lwe_ciphertext_count,
stream, gpu_index);
plaintext_addition<<<grid, thds, 0, stream>>>(
cuda_memcpy_async_gpu_to_gpu(
output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count, stream);
plaintext_addition<<<grid, thds, 0, stream->stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -62,12 +60,11 @@ __global__ void addition(T *output, T *input_1, T *input_2,
// Coefficient-wise addition
template <typename T>
__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
T *input_1, T *input_2,
uint32_t input_lwe_dimension,
__host__ void host_addition(cuda_stream_t *stream, T *output, T *input_1,
T *input_2, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -78,7 +75,8 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
addition<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
addition<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -96,12 +94,11 @@ __global__ void subtraction(T *output, T *input_1, T *input_2,
// Coefficient-wise subtraction
template <typename T>
__host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
T *output, T *input_1, T *input_2,
uint32_t input_lwe_dimension,
__host__ void host_subtraction(cuda_stream_t *stream, T *output, T *input_1,
T *input_2, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -112,7 +109,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
subtraction<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
subtraction<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -132,13 +130,12 @@ __global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
}
template <typename T>
__host__ void host_subtraction_plaintext(cudaStream_t stream,
uint32_t gpu_index, T *output,
__host__ void host_subtraction_plaintext(cuda_stream_t *stream, T *output,
T *lwe_input, T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -148,9 +145,9 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
input_lwe_ciphertext_count *
(input_lwe_dimension + 1) * sizeof(T),
stream, gpu_index);
stream);
radix_body_subtraction_inplace<<<grid, thds, 0, stream>>>(
radix_body_subtraction_inplace<<<grid, thds, 0, stream->stream>>>(
output, plaintext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -178,11 +175,11 @@ __global__ void unchecked_sub_with_correcting_term(
template <typename T>
__host__ void host_unchecked_sub_with_correcting_term(
cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
cuda_stream_t *stream, T *output, T *input_1, T *input_2,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t degree) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -193,7 +190,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
unchecked_sub_with_correcting_term<<<grid, thds, 0, stream>>>(
unchecked_sub_with_correcting_term<<<grid, thds, 0, stream->stream>>>(
output, input_1, input_2, num_entries, lwe_size, message_modulus,
carry_modulus, degree);
check_cuda_error(cudaGetLastError());
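
A CPU reference for the coefficient-wise operations above is handy when checking the kernels: addition acts on lwe_size = input_lwe_dimension + 1 coefficients per ciphertext and relies on the natural wrap-around of the unsigned Torus type. Sketch (illustrative only):

#include <cstdint>
#include <vector>

// CPU reference for the coefficient-wise addition kernel above.
template <typename T>
std::vector<T> reference_addition(const std::vector<T> &input_1,
                                  const std::vector<T> &input_2,
                                  uint32_t input_lwe_dimension,
                                  uint32_t input_lwe_ciphertext_count) {
  const size_t num_entries =
      static_cast<size_t>(input_lwe_dimension + 1) * input_lwe_ciphertext_count;
  std::vector<T> output(num_entries);
  for (size_t i = 0; i < num_entries; i++)
    output[i] = static_cast<T>(input_1[i] + input_2[i]); // wraps mod 2^w
  return output;
}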

View File

@@ -5,12 +5,11 @@
* cleartext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
host_cleartext_multiplication(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in),
input_lwe_dimension,
@@ -45,12 +44,11 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
* function that performs the operation on the GPU.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
host_cleartext_multiplication(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in),
input_lwe_dimension,

View File

@@ -29,12 +29,11 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
template <typename T>
__host__ void
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension,
host_cleartext_multiplication(cuda_stream_t *stream, T *output, T *lwe_input,
T *cleartext_input, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -45,7 +44,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cleartext_multiplication<<<grid, thds, 0, stream>>>(
cleartext_multiplication<<<grid, thds, 0, stream->stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
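
For comparison, a cleartext reference of the multiplication above, assuming each coefficient (mask and body) of the i-th ciphertext is scaled by the i-th cleartext with wrap-around on the Torus type (a sketch; the per-entry indexing of the real kernel is elided in this hunk):

#include <cstdint>
#include <vector>

// CPU reference for cleartext multiplication of a vector of LWE ciphertexts.
template <typename T>
std::vector<T> reference_cleartext_multiplication(
    const std::vector<T> &lwe_input, const std::vector<T> &cleartext_input,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
  const size_t lwe_size = input_lwe_dimension + 1;
  std::vector<T> output(lwe_size * input_lwe_ciphertext_count);
  for (size_t ct = 0; ct < input_lwe_ciphertext_count; ct++)
    for (size_t j = 0; j < lwe_size; j++)
      output[ct * lwe_size + j] =
          static_cast<T>(lwe_input[ct * lwe_size + j] * cleartext_input[ct]);
  return output;
}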

View File

@@ -4,14 +4,13 @@
* Perform the negation of a u32 input LWE ciphertext vector.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
host_negation(stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
@@ -38,14 +37,13 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
* LWE ciphertext vector is left unchanged. This function is a wrapper to a
* device function that performs the operation on the GPU.
*/
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
host_negation(stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -22,11 +22,11 @@ __global__ void negation(T *output, T *input, uint32_t num_entries) {
}
template <typename T>
__host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
T *input, uint32_t input_lwe_dimension,
__host__ void host_negation(cuda_stream_t *stream, T *output, T *input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
@@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
negation<<<grid, thds, 0, stream>>>(output, input, num_entries);
negation<<<grid, thds, 0, stream->stream>>>(output, input, num_entries);
check_cuda_error(cudaGetLastError());
}

View File

@@ -1,38 +1,36 @@
#include "bootstrapping_key.cuh"
void cuda_convert_lwe_programmable_bootstrap_key_32(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key<uint32_t, int32_t>(
static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
(int32_t *)src, polynomial_size, total_polynomials);
(double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key<uint64_t, int64_t>(
static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
(int64_t *)src, polynomial_size, total_polynomials);
(double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor) {
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t grouping_factor) {
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
level_count * (1 << grouping_factor) /
grouping_factor;
size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);
cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
static_cast<cudaStream_t>(stream), gpu_index);
stream);
}
// We need these lines so the compiler knows how to specialize these functions
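
The polynomial counts used by these conversions follow directly from the key shapes: a classical bootstrapping key stores (glwe_dim + 1)^2 * level_count polynomials per input LWE coefficient, and the multi-bit key groups input coefficients by grouping_factor with 2^grouping_factor variants per group, as in the expressions above. Worked out as small helpers (sketch only, mirroring those formulas):

#include <cstddef>
#include <cstdint>

inline size_t classical_bsk_total_polynomials(uint32_t input_lwe_dim,
                                              uint32_t glwe_dim,
                                              uint32_t level_count) {
  return static_cast<size_t>(input_lwe_dim) * (glwe_dim + 1) * (glwe_dim + 1) *
         level_count;
}

inline size_t multi_bit_bsk_size_bytes(uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t level_count,
                                       uint32_t polynomial_size,
                                       uint32_t grouping_factor) {
  // Same left-to-right integer arithmetic as in the wrapper above;
  // grouping_factor is assumed to divide the running product exactly.
  size_t total_polynomials = static_cast<size_t>(input_lwe_dim) *
                             (glwe_dim + 1) * (glwe_dim + 1) * level_count *
                             (1u << grouping_factor) / grouping_factor;
  return total_polynomials * polynomial_size * sizeof(uint64_t);
}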

View File

@@ -60,13 +60,12 @@ __device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
}
////////////////////////////////////////////////
template <typename T, typename ST>
void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
uint32_t gpu_index,
double2 *dest, ST *src,
uint32_t polynomial_size,
uint32_t total_polynomials) {
void cuda_convert_lwe_programmable_bootstrap_key(
double2 *dest, ST *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t total_polynomials) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;
// Here the buffer size is the size of double2 times the number of polynomials
@@ -80,7 +79,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
double2 *h_bsk = (double2 *)malloc(buffer_size);
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream);
// compress the real bsk to complex and divide it by DOUBLE_MAX
for (int i = 0; i < total_polynomials; i++) {
@@ -97,12 +96,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
}
}
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream);
double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
double2 *buffer = (double2 *)cuda_malloc_async(0, stream);
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -110,17 +109,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -128,17 +127,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -146,17 +145,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -164,17 +163,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -182,17 +181,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -200,17 +199,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -218,13 +217,13 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
d_bsk, dest, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
}
break;
default:
@@ -232,17 +231,16 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
"N's are powers of two in the interval [256..16384].")
}
cuda_drop_async(d_bsk, stream, gpu_index);
cuda_drop_async(buffer, stream, gpu_index);
cuda_drop_async(d_bsk, stream);
cuda_drop_async(buffer, stream);
free(h_bsk);
}
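
The compression step above packs each degree-N integer polynomial of the key into N/2 complex samples scaled into roughly [-1, 1] before the forward FFT. The exact pairing and scaling live in the elided loop body; the sketch below assumes coefficient j is paired with coefficient j + N/2 and that the scale is the signed maximum of the Torus type, both assumptions made only for illustration:

#include <cstdint>
#include <limits>
#include <vector>

struct complex_d { double x, y; };

// Hypothetical host-side compression of one polynomial of the key.
inline std::vector<complex_d>
compress_poly_to_complex(const std::vector<int64_t> &poly) {
  const size_t half = poly.size() / 2;
  const double scale = static_cast<double>(std::numeric_limits<int64_t>::max());
  std::vector<complex_d> out(half);
  for (size_t j = 0; j < half; j++)
    out[j] = {static_cast<double>(poly[j]) / scale,
              static_cast<double>(poly[j + half]) / scale};
  return out;
}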
void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
void *_input1, void *_input2, void *_output,
void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
cuda_stream_t *stream,
uint32_t polynomial_size,
uint32_t total_polynomials) {
cudaSetDevice(gpu_index);
auto input1 = (double2 *)_input1;
auto input2 = (double2 *)_input2;
auto output = (double2 *)_output;
@@ -255,8 +253,8 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
@@ -266,18 +264,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
@@ -287,18 +286,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
@@ -308,18 +308,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
@@ -329,18 +330,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
@@ -350,18 +352,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
@@ -371,18 +374,19 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
@@ -393,19 +397,20 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
shared_memory_size * total_polynomials, stream);
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
<<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
buffer);
}
break;
default:
break;
}
cuda_drop_async(buffer, stream, gpu_index);
cuda_drop_async(buffer, stream);
}
#endif // CNCRT_BSK_H
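
Both the key conversion and the polynomial multiplication use the same dispatch idiom: run the FULLSM kernel variant with dynamic shared memory when it fits on the device, otherwise allocate a global scratch buffer and run the NOSM variant. A self-contained CUDA sketch of that idiom (kernels and names are illustrative, not the library's; n is assumed to fit in one block):

#include <cuda_runtime.h>

// Stand-ins for the FULLSM / NOSM kernel variants (illustrative only).
__global__ void scale_fullsm(const double *in, double *out, int n) {
  extern __shared__ double smem[];
  int i = threadIdx.x;
  if (i < n) {
    smem[i] = in[i];
    out[i] = 2.0 * smem[i];
  }
}

__global__ void scale_nosm(const double *in, double *out, double *scratch,
                           int n) {
  int i = threadIdx.x;
  if (i < n) {
    scratch[i] = in[i];
    out[i] = 2.0 * scratch[i];
  }
}

// Prefer dynamic shared memory when it fits, otherwise fall back to a global
// scratch buffer, mirroring the FULLSM / NOSM branches above.
void scale_with_fallback(const double *d_in, double *d_out, int n,
                         cudaStream_t stream, int gpu_index) {
  size_t shared_memory_size = n * sizeof(double);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, gpu_index);
  if (shared_memory_size <= prop.sharedMemPerBlockOptin) {
    cudaFuncSetAttribute(scale_fullsm,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         (int)shared_memory_size);
    scale_fullsm<<<1, n, shared_memory_size, stream>>>(d_in, d_out, n);
  } else {
    void *scratch = nullptr;
    cudaMallocAsync(&scratch, shared_memory_size, stream);
    scale_nosm<<<1, n, 0, stream>>>(d_in, d_out,
                                    static_cast<double *>(scratch), n);
    cudaFreeAsync(scratch, stream);
  }
}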

View File

@@ -1,44 +1 @@
#include "programmable_bootstrap.cuh"
template <>
__device__ int get_this_block_rank(grid_group &group, bool support_dsm) {
return blockIdx.y;
}
template <>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, grid_group &group,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm) {
double2 *buffer_slice =
global_memory_buffer +
(glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
return buffer_slice;
}
#if CUDA_ARCH >= 900
template <>
__device__ int get_this_block_rank(cluster_group &cluster, bool support_dsm) {
if (support_dsm)
return cluster.block_rank();
else
return blockIdx.y;
}
template <>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, cluster_group &cluster,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm) {
double2 *buffer_slice;
if (support_dsm) {
extern __shared__ double2 smem[];
buffer_slice = cluster.map_shared_rank(
smem, glwe_id + level_id * (glwe_dimension + 1));
} else {
buffer_slice =
global_memory_buffer +
(glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
}
return buffer_slice;
}
#endif

View File

@@ -1,124 +1,11 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
#include "cooperative_groups.h"
using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename G>
__device__ int get_this_block_rank(G &group, bool support_dsm);
template <typename G>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, G &group,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm);
template <typename Torus, typename G, class params>
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
double2 *join_buffer, double2 *bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension,
int level_count, int iteration, G &group,
bool support_dsm = false) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
auto buffer_slice =
get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
polynomial_size, glwe_dimension, support_dsm);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
group.sync();
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
join_buffer, polynomial_size,
glwe_dimension, support_dsm);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
auto src_acc =
get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
polynomial_size, glwe_dimension, support_dsm);
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
join_buffer, polynomial_size,
glwe_dimension, support_dsm);
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(fft, accumulator);
__syncthreads();
}
#include "../../include/device.h"
#include "../../include/programmable_bootstrap.h"
#include "../include/device.h"
#include "programmable_bootstrap_classic.cuh"
#include "programmable_bootstrap_multibit.cuh"
template <typename Torus>
void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, void *bootstrapping_key,
@@ -128,7 +15,6 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
auto num_inputs_on_gpu = input_lwe_ciphertext_count / gpu_count;
switch (sizeof(Torus)) {
case sizeof(uint32_t):
// 32 bits
@@ -137,11 +23,11 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
PANIC("Error: 32-bit multibit PBS is not supported.\n")
case CLASSICAL:
cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes,
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
default:
break;
@@ -154,19 +40,19 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes,
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory);
input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory);
break;
case CLASSICAL:
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes,
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
default:
PANIC("Error: unsupported cuda PBS type.")
@@ -179,15 +65,13 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
}
template <typename Torus>
void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
if (gpu_index != 0)
PANIC("GPU error (pbs): all memory has to reside in GPU 0.")
switch (sizeof(Torus)) {
case sizeof(uint32_t):
// 32 bits
@@ -196,9 +80,8 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
PANIC("Error: 32-bit multibit PBS is not supported.\n")
case CLASSICAL:
scratch_cuda_programmable_bootstrap_32(
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Error: unsupported cuda PBS type.")
@@ -211,15 +94,14 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
scratch_cuda_multi_bit_programmable_bootstrap_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, grouping_factor, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
break;
case CLASSICAL:
scratch_cuda_programmable_bootstrap_64(
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Error: unsupported cuda PBS type.")
@@ -230,5 +112,3 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
"moduli are supported.")
}
}
#endif
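// --- Illustrative sketch (editor's addition, not part of the diff) ---
// execute_pbs / execute_scratch_pbs above dispatch first on sizeof(Torus)
// (4 or 8 bytes) and then on the PBS type. A self-contained toy version of
// that dispatch pattern; every name below is a stand-in, not a symbol from
// this header.
#include <cstdint>
#include <cstdio>

enum pbs_type_example { CLASSICAL_EXAMPLE, MULTI_BIT_EXAMPLE }; // stand-in enum

template <typename Torus> void dispatch_pbs_example(pbs_type_example type) {
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32-bit ciphertexts: only the classical PBS is available.
    if (type == MULTI_BIT_EXAMPLE)
      std::puts("32-bit multibit PBS is not supported");
    else
      std::puts("would call the 32-bit classical entry point");
    break;
  case sizeof(uint64_t):
    std::puts(type == MULTI_BIT_EXAMPLE
                  ? "would call the 64-bit multi-bit entry point"
                  : "would call the 64-bit classical entry point");
    break;
  default:
    std::puts("only 32-bit and 64-bit moduli are supported");
  }
}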

View File

@@ -18,64 +18,57 @@ uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
* be used.
*/
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..16384].")
"N's are powers of two"
" in the interval [256..16384].")
}
}
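// --- Illustrative sketch (editor's addition, not part of the diff) ---
// A small helper capturing the constraint enforced by the PANIC above: the
// polynomial size N must be a power of two in [256, 16384]. The helper name
// is hypothetical and does not exist in the code base.
#include <cstdint>

static inline bool is_supported_amortized_pbs_degree(uint32_t n) {
  bool is_power_of_two = n != 0 && (n & (n - 1)) == 0;
  return is_power_of_two && n >= 256 && n <= 16384;
}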
@@ -86,60 +79,52 @@ void scratch_cuda_programmable_bootstrap_amortized_32(
* be used.
*/
void scratch_cuda_programmable_bootstrap_amortized_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -152,13 +137,12 @@ void scratch_cuda_programmable_bootstrap_amortized_64(
* ciphertexts. See the corresponding operation on 64 bits for more details.
*/
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
if (base_log > 32)
PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -167,66 +151,66 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 512:
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 1024:
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 2048:
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 4096:
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 8192:
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 16384:
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
default:
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -301,13 +285,12 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
* values for the FFT
*/
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
if (base_log > 64)
PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -316,66 +299,66 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 512:
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 1024:
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 2048:
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 4096:
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 8192:
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 16384:
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory);
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
default:
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -388,11 +371,9 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
* This cleanup function frees the GPU memory used by the amortized PBS
* buffer for 32-bit or 64-bit inputs.
*/
void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
uint32_t gpu_index,
void cleanup_cuda_programmable_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer) {
check_cuda_error(cudaSetDevice(gpu_index));
// Free memory
cuda_drop_async(*pbs_buffer, static_cast<cudaStream_t>(stream), gpu_index);
cuda_drop_async(*pbs_buffer, stream);
}
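// --- Illustrative usage sketch (editor's addition, not part of the diff) ---
// Allocate the scratch buffer, run the 64-bit amortized PBS on a batch, then
// release the buffer. The stream, device pointers and parameter values are
// placeholders supplied by the caller; only the three entry points and their
// argument order come from the functions above.
void run_amortized_pbs_64_example(cuda_stream_t *stream, void *lwe_out,
                                  void *lwe_out_idx, void *lut, void *lut_idx,
                                  void *lwe_in, void *lwe_in_idx, void *bsk,
                                  uint32_t lwe_dimension, uint32_t glwe_dimension,
                                  uint32_t polynomial_size, uint32_t base_log,
                                  uint32_t level_count, uint32_t num_samples,
                                  uint32_t max_shared_memory) {
  int8_t *pbs_buffer = nullptr;
  scratch_cuda_programmable_bootstrap_amortized_64(
      stream, &pbs_buffer, glwe_dimension, polynomial_size, num_samples,
      max_shared_memory, /*allocate_gpu_memory=*/true);
  cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
      stream, lwe_out, lwe_out_idx, lut, lut_idx, lwe_in, lwe_in_idx, bsk,
      pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log,
      level_count, num_samples, /*num_luts=*/1, /*lwe_idx=*/0,
      max_shared_memory);
  cleanup_cuda_programmable_bootstrap_amortized(stream, &pbs_buffer);
}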

View File

@@ -253,11 +253,10 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
template <typename Torus, typename STorus, typename params>
__host__ void scratch_programmable_bootstrap_amortized(
cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
@@ -285,22 +284,22 @@ __host__ void scratch_programmable_bootstrap_amortized(
get_buffer_size_programmable_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
}
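// --- Illustrative sketch (editor's addition, not part of the diff) ---
// The buffer above is allocated with the repository's stream-ordered helper
// cuda_malloc_async and later released with cuda_drop_async. For reference,
// here is the same allocate/free-on-a-stream pattern written against the
// plain CUDA runtime API; this is an analogy, not the helpers' actual
// implementation.
#include <cuda_runtime.h>

static void stream_ordered_scratch_example(size_t bytes, cudaStream_t stream) {
  void *scratch = nullptr;
  cudaMallocAsync(&scratch, bytes, stream); // allocation ordered on the stream
  // ... enqueue kernels that use `scratch` on the same stream ...
  cudaFreeAsync(scratch, stream);           // freed once prior work completes
}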
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_amortized(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
uint64_t SM_FULL =
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
@@ -328,14 +327,14 @@ __host__ void host_programmable_bootstrap_amortized(
// of shared memory)
if (max_shared_memory < SM_PART) {
device_programmable_bootstrap_amortized<Torus, params, NOSM>
<<<grid, thds, 0, stream>>>(
<<<grid, thds, 0, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, stream>>>(
<<<grid, thds, SM_PART, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log,
@@ -347,7 +346,7 @@ __host__ void host_programmable_bootstrap_amortized(
// For lower compute capabilities, this call
// just does nothing and the amount of shared memory used is 48 KB
device_programmable_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, stream>>>(
<<<grid, thds, SM_FULL, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log,
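
// --- Illustrative sketch (editor's addition, not part of the diff) ---
// The three launch branches above pick a kernel variant from the amount of
// shared memory available: no shared memory when even the partial working set
// does not fit, partial shared memory when only the full working set does not
// fit, and full shared memory otherwise. A hypothetical helper expressing
// that decision:
#include <cstdint>

enum sm_mode_example { NOSM_EXAMPLE, PARTIALSM_EXAMPLE, FULLSM_EXAMPLE };

static inline sm_mode_example pick_sm_mode_example(uint64_t max_shared_memory,
                                                   uint64_t sm_part,
                                                   uint64_t sm_full) {
  if (max_shared_memory < sm_part)
    return NOSM_EXAMPLE;
  if (max_shared_memory < sm_full)
    return PARTIALSM_EXAMPLE;
  return FULLSM_EXAMPLE;
}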

View File

@@ -7,6 +7,7 @@
#endif
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
@@ -14,13 +15,104 @@
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"
// Cooperative groups are used for this implementation
using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename Torus, class params>
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
double2 *join_buffer, double2 *bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension,
int level_count, int iteration,
grid_group &grid) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
// Selects all GLWEs in a particular decomposition level
auto level_join_buffer =
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(fft, accumulator);
__syncthreads();
}
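// --- Illustrative sketch (editor's addition, not part of the diff) ---
// A sequential, host-side reference of the sum mul_ggsw_glwe builds in the
// Fourier domain: every output GLWE polynomial j accumulates the products of
// the level-l decomposition of input polynomial k with the matching
// bootstrapping-key polynomial. Names and data layout here are hypothetical;
// in the kernel, blockIdx.x plays the role of l and blockIdx.y the role of k.
#include <complex>
#include <vector>
using cplx_example = std::complex<double>;

void external_product_reference_example(
    const std::vector<std::vector<std::vector<cplx_example>>> &decomp_fft, // [l][k][coef]
    const std::vector<std::vector<std::vector<std::vector<cplx_example>>>>
        &bsk_fft,                                     // [l][k][j][coef]
    std::vector<std::vector<cplx_example>> &out_fft,  // [j][coef]
    int level_count, int glwe_dimension, int half_degree) {
  for (int j = 0; j < glwe_dimension + 1; j++)
    for (int c = 0; c < half_degree; c++)
      out_fft[j][c] = cplx_example(0.0, 0.0);
  // Accumulate every (level, input polynomial) contribution into each output
  // polynomial, i.e. the sum the per-block kernel builds in join_buffer.
  for (int l = 0; l < level_count; l++)
    for (int k = 0; k < glwe_dimension + 1; k++)
      for (int j = 0; j < glwe_dimension + 1; j++)
        for (int c = 0; c < half_degree; c++)
          out_fft[j][c] += decomp_fft[l][k][c] * bsk_fft[l][k][j][c];
}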
/*
* Kernel that computes the classical PBS using cooperative groups
*
@@ -130,7 +222,7 @@ __global__ void device_programmable_bootstrap_cg(
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, grid_group, params>(
mul_ggsw_glwe<Torus, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, grid);
@@ -154,13 +246,11 @@ __global__ void device_programmable_bootstrap_cg(
template <typename Torus, typename STorus, typename params>
__host__ void scratch_programmable_bootstrap_cg(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
@@ -186,7 +276,7 @@ __host__ void scratch_programmable_bootstrap_cg(
}
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);
}
@@ -195,14 +285,14 @@ __host__ void scratch_programmable_bootstrap_cg(
*/
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_cg(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
@@ -242,18 +332,18 @@ __host__ void host_programmable_bootstrap_cg(
kernel_args[13] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, grid,
thds, (void **)kernel_args, 0, stream));
thds, (void **)kernel_args, 0, stream->stream));
} else if (max_shared_memory < full_sm) {
kernel_args[13] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, stream));
grid, thds, (void **)kernel_args, partial_sm, stream->stream));
} else {
int no_dm = 0;
kernel_args[13] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg<Torus, params, FULLSM>, grid,
thds, (void **)kernel_args, full_sm, stream));
thds, (void **)kernel_args, full_sm, stream->stream));
}
check_cuda_error(cudaGetLastError());
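// --- Illustrative sketch (editor's addition, not part of the diff) ---
// The launches above go through cudaLaunchCooperativeKernel so the whole grid
// can synchronize between blind-rotation steps. A minimal, self-contained
// example of that launch pattern with hypothetical names (a cooperative
// launch and relocatable device code are required for grid.sync()):
#include <cooperative_groups.h>

__global__ void coop_example_kernel(int *per_block, int *total) {
  auto grid = cooperative_groups::this_grid();
  if (threadIdx.x == 0)
    per_block[blockIdx.x] = (int)blockIdx.x;
  grid.sync(); // whole-grid barrier, only legal under a cooperative launch
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    int sum = 0;
    for (unsigned b = 0; b < gridDim.x; b++)
      sum += per_block[b];
    *total = sum;
  }
}

void launch_coop_example(int *d_per_block, int *d_total, cudaStream_t stream) {
  dim3 grid_dim(8), block_dim(128);
  void *args[] = {&d_per_block, &d_total}; // arguments passed as pointers
  check_cuda_error(cudaLaunchCooperativeKernel(
      (void *)coop_example_kernel, grid_dim, block_dim, args, 0, stream));
}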

View File

@@ -1,5 +1,5 @@
#ifndef CUDA_CG_MULTIBIT_PBS_CUH
#define CUDA_CG_MULTIBIT_PBS_CUH
#ifndef CUDA_FAST_MULTIBIT_PBS_CUH
#define CUDA_FAST_MULTIBIT_PBS_CUH
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
@@ -11,7 +11,6 @@
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
@@ -107,9 +106,9 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, grid_group, params>(
accumulator, accumulator_fft, block_join_buffer, keybundle,
polynomial_size, glwe_dimension, level_count, i, grid);
mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
block_join_buffer, keybundle, polynomial_size,
glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
@@ -170,13 +169,13 @@ __host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
template <typename Torus, typename STorus, typename params>
__host__ void scratch_cg_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -241,26 +240,23 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
}
if (!lwe_chunk_size)
lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, max_shared_memory);
lwe_chunk_size = get_lwe_chunk_size(input_lwe_ciphertext_count);
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
allocate_gpu_memory);
}
template <typename Torus, class params>
__host__ void execute_cg_external_product_loop(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *lwe_array_out, Torus *lwe_output_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) {
__host__ void execute_external_product_loop(
cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_chunk_size, uint32_t max_shared_memory,
int lwe_offset) {
cudaSetDevice(gpu_index);
uint64_t full_dm =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
@@ -310,55 +306,54 @@ __host__ void execute_cg_external_product_loop(
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, NOSM>,
grid_accumulate, thds, (void **)kernel_args, 0, stream));
grid_accumulate, thds, (void **)kernel_args, 0, stream->stream));
} else if (max_shared_memory < full_dm) {
kernel_args[19] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, PARTIALSM>,
grid_accumulate, thds, (void **)kernel_args, partial_dm, stream));
grid_accumulate, thds, (void **)kernel_args, partial_dm,
stream->stream));
} else {
kernel_args[19] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, FULLSM>,
grid_accumulate, thds, (void **)kernel_args, full_dm, stream));
grid_accumulate, thds, (void **)kernel_args, full_dm, stream->stream));
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, num_samples, polynomial_size, max_shared_memory);
lwe_chunk_size = get_lwe_chunk_size(num_samples);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
stream, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, max_shared_memory,
lwe_chunk_size, lwe_offset);
// Accumulate
execute_cg_external_product_loop<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_chunk_size,
max_shared_memory, lwe_offset);
execute_external_product_loop<Torus, params>(
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
lwe_chunk_size, max_shared_memory, lwe_offset);
}
}
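// --- Illustrative sketch (editor's addition, not part of the diff) ---
// The loop above walks the lwe_dimension / grouping_factor key groups in
// chunks of lwe_chunk_size; each iteration computes one keybundle and then
// accumulates it. A hypothetical helper that simply enumerates those chunks:
#include <algorithm>
#include <cstdint>
#include <cstdio>

static void print_multibit_pbs_chunks_example(uint32_t lwe_dimension,
                                              uint32_t grouping_factor,
                                              uint32_t lwe_chunk_size) {
  uint32_t total_groups = lwe_dimension / grouping_factor;
  for (uint32_t lwe_offset = 0; lwe_offset < total_groups;
       lwe_offset += lwe_chunk_size) {
    uint32_t chunk = std::min(lwe_chunk_size, total_groups - lwe_offset);
    std::printf("offset %u: keybundle over %u key groups\n", lwe_offset, chunk);
  }
}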

View File

@@ -1,8 +1,5 @@
#include "programmable_bootstrap_cg_classic.cuh"
#include "programmable_bootstrap_classic.cuh"
#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_classic.cuh"
#endif
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
@@ -15,190 +12,6 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
max_shared_memory);
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
#if CUDA_ARCH >= 900
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
#else
return false;
#endif
}
#if (CUDA_ARCH >= 900)
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
break;
case 512:
host_programmable_bootstrap_tbc<Torus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
break;
case 1024:
host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
break;
case 2048:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
break;
case 4096:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
break;
case 8192:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
break;
case 16384:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}
#endif
/*
* Returns the buffer size for 64-bit executions
*/
@@ -220,7 +33,7 @@ uint64_t get_buffer_size_programmable_bootstrap_64(
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
@@ -228,44 +41,37 @@ void scratch_cuda_programmable_bootstrap_cg(
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
glwe_dimension, polynomial_size, level_count,
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
@@ -277,7 +83,7 @@ void scratch_cuda_programmable_bootstrap_cg(
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
@@ -285,45 +91,38 @@ void scratch_cuda_programmable_bootstrap(
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
stream, buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -339,33 +138,23 @@ void scratch_cuda_programmable_bootstrap(
* be used.
*/
void scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
else
scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
}
/*
@@ -374,40 +163,30 @@ void scratch_cuda_programmable_bootstrap_32(
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
void scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
else
scratch_cuda_programmable_bootstrap<uint64_t, int64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
}
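
A minimal caller sketch for the scratch/cleanup pair, following the added (cuda_stream_t-based) signatures in this diff. It is illustration only: cuda_create_stream, cuda_get_max_shared_memory and cuda_destroy_stream are assumed device-API helpers not shown in this diff, and the bootstrap and cleanup entry points referenced here are defined further down in this file.

// Sketch only, not part of this diff. Allocates the classical PBS scratch
// buffer for 64-bit ciphertexts and releases it with the cleanup entry point.
// #include "programmable_bootstrap.h"  // assumed header exposing the C API
void example_scratch_lifecycle() {
  uint32_t gpu_index = 0;
  cuda_stream_t *stream = cuda_create_stream(gpu_index);   // assumed helper
  uint32_t max_sm = cuda_get_max_shared_memory(gpu_index); // assumed helper

  int8_t *pbs_scratch = nullptr;
  scratch_cuda_programmable_bootstrap_64(
      stream, &pbs_scratch, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
      /*level_count=*/1, /*input_lwe_ciphertext_count=*/1, max_sm,
      /*allocate_gpu_memory=*/true);

  // ... launch cuda_programmable_bootstrap_lwe_ciphertext_vector_64 here ...

  cleanup_cuda_programmable_bootstrap(stream, &pbs_scratch);
  cuda_destroy_stream(stream); // assumed helper
}
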
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
@@ -416,59 +195,52 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 512:
host_programmable_bootstrap_cg<Torus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 1024:
host_programmable_bootstrap_cg<Torus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 2048:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 4096:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 8192:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 16384:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -479,9 +251,9 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
@@ -490,59 +262,52 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap<Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 512:
host_programmable_bootstrap<Torus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 1024:
host_programmable_bootstrap<Torus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 2048:
host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 4096:
host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 8192:
host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 16384:
host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, max_shared_memory);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -554,42 +319,22 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
if (base_log > 32)
PANIC("Cuda error (classical PBS): base log should be > number of bits "
"in the ciphertext representation (32)");
pbs_buffer<uint64_t, CLASSICAL> *buffer =
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
switch (buffer->pbs_variant) {
case TBC:
#if CUDA_ARCH >= 900
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key),
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
break;
case CG:
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory))
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
@@ -599,10 +344,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
case DEFAULT:
else
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
@@ -612,10 +356,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
@@ -691,41 +431,21 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
* values for the FFT
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be > number of bits "
"in the ciphertext representation (64)");
pbs_buffer<uint64_t, CLASSICAL> *buffer =
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if (CUDA_ARCH >= 900)
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key),
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
break;
case PBS_VARIANT::CG:
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory))
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
@@ -735,10 +455,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
case PBS_VARIANT::DEFAULT:
else
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
@@ -748,21 +467,16 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
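
For reference, one hedged invocation of the 64-bit entry point above, following the added (cuda_stream_t-based) signature in this diff. On that side the entry point checks cooperative-groups support at call time and otherwise falls back to the default kernels, so the caller only passes device pointers and parameters; every name and numeric value below is a placeholder.

// Sketch only: one batched 64-bit classical PBS over num_samples ciphertexts,
// reusing a scratch buffer obtained from scratch_cuda_programmable_bootstrap_64.
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    stream, d_lwe_out, d_lwe_out_indexes, d_lut_vector, d_lut_vector_indexes,
    d_lwe_in, d_lwe_in_indexes, d_bootstrapping_key, pbs_scratch,
    /*lwe_dimension=*/742, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
    /*base_log=*/23, /*level_count=*/1, /*num_samples=*/4, /*num_luts=*/1,
    /*lwe_idx=*/0, max_sm);
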
/*
* This cleanup function frees the data on GPU for the PBS buffer for 32 or 64
* bits inputs.
*/
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
void cleanup_cuda_programmable_bootstrap(cuda_stream_t *stream,
int8_t **buffer) {
cudaSetDevice(gpu_index);
auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
x->release(stream);
}
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
@@ -770,7 +484,7 @@ template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
uint32_t num_samples, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
@@ -780,7 +494,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
uint32_t lwe_idx, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
@@ -790,20 +504,19 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t lwe_idx, uint32_t max_shared_memory);
template void scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
cuda_stream_t *stream, pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template void scratch_cuda_programmable_bootstrap<uint64_t, int64_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, CLASSICAL> **buffer,
cuda_stream_t *stream, pbs_buffer<uint64_t, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
cuda_stream_t *stream, uint32_t *lwe_array_out,
uint32_t *lwe_output_indexes, uint32_t *lut_vector,
uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
@@ -813,7 +526,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
uint32_t lwe_idx, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
cuda_stream_t *stream, uint32_t *lwe_array_out,
uint32_t *lwe_output_indexes, uint32_t *lut_vector,
uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
@@ -823,54 +536,13 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
uint32_t lwe_idx, uint32_t max_shared_memory);
template void scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template void scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint32_t, CLASSICAL> **buffer,
cuda_stream_t *stream, pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
uint32_t *lwe_output_indexes, uint32_t *lut_vector,
uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
template void scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
cuda_stream_t *stream, pbs_buffer<uint32_t, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template void scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#endif

View File

@@ -264,13 +264,11 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
template <typename Torus, typename STorus, typename params>
__host__ void scratch_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
@@ -320,42 +318,41 @@ __host__ void scratch_programmable_bootstrap(
}
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
}
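
The kernel-launch branches in execute_step_one and execute_step_two below select a shared-memory mode from max_shared_memory. A small standalone restatement of that rule, for reference only; it assumes partial_sm <= full_sm, as the sizes computed above imply.

// Mirrors the NOSM / PARTIALSM / FULLSM selection used by the launches below;
// illustration only, not part of this diff.
enum class SmVariant { NoSm, PartialSm, FullSm };

inline SmVariant pick_sm_variant(uint64_t max_shared_memory,
                                 uint64_t partial_sm, uint64_t full_sm) {
  if (max_shared_memory < partial_sm)
    return SmVariant::NoSm;      // no dynamic shared memory, work out of d_mem
  if (max_shared_memory < full_sm)
    return SmVariant::PartialSm; // partial shared memory, rest in device memory
  return SmVariant::FullSm;      // everything fits in shared memory
}
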
template <typename Torus, class params>
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
double2 *bootstrapping_key, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, int8_t *d_mem,
uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
cudaSetDevice(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_one<Torus, params, NOSM>
<<<grid, thds, 0, stream>>>(
<<<grid, thds, 0, stream->stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
<<<grid, thds, partial_sm, stream->stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_programmable_bootstrap_step_one<Torus, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
<<<grid, thds, full_sm, stream->stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
@@ -366,36 +363,35 @@ __host__ void execute_step_one(
template <typename Torus, class params>
__host__ void execute_step_two(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
double2 *bootstrapping_key, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, int8_t *d_mem,
uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
cudaSetDevice(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_two<Torus, params, NOSM>
<<<grid, thds, 0, stream>>>(
<<<grid, thds, 0, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
<<<grid, thds, partial_sm, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_programmable_bootstrap_step_two<Torus, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
<<<grid, thds, full_sm, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
@@ -408,14 +404,14 @@ __host__ void execute_step_two(
*/
template <typename Torus, class params>
__host__ void host_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
@@ -440,14 +436,13 @@ __host__ void host_programmable_bootstrap(
for (int i = 0; i < lwe_dimension; i++) {
execute_step_one<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem,
max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one,
full_dm_step_one);
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
input_lwe_ciphertext_count, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, d_mem, max_shared_memory, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
execute_step_two<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, bootstrapping_key, global_accumulator,
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem,

View File

@@ -3,10 +3,6 @@
#include "programmable_bootstrap_multibit.cuh"
#include "programmable_bootstrap_multibit.h"
#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_multibit.cuh"
#endif
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory) {
@@ -15,62 +11,11 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
max_shared_memory);
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
#if CUDA_ARCH >= 900
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
#else
return false;
#endif
}
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -85,65 +30,65 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
case 256:
host_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 8192:
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 16384:
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -154,9 +99,9 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -171,64 +116,64 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
case 256:
host_multi_bit_programmable_bootstrap<uint64_t, int64_t,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 512:
host_multi_bit_programmable_bootstrap<Torus, int64_t, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 1024:
host_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 2048:
host_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 4096:
host_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 8192:
host_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 16384:
host_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -238,22 +183,19 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
}
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory))
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
@@ -263,43 +205,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
#else
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
break;
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
case PBS_VARIANT::DEFAULT:
else
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
static_cast<uint64_t *>(bootstrapping_key),
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
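
A hedged end-to-end sketch for the multi-bit path above, following the added (cuda_stream_t-based) signatures in this diff. cleanup_cuda_multi_bit_programmable_bootstrap and the stream helpers are assumed to exist in the backend with the shapes used here and are not shown in this diff; all concrete parameter values are placeholders.

// Sketch only: allocate multi-bit PBS scratch, run one batch, release it.
void example_multi_bit_pbs_64(void *d_lwe_out, void *d_lwe_out_idx,
                              void *d_lut, void *d_lut_idx, void *d_lwe_in,
                              void *d_lwe_in_idx, void *d_bsk) {
  uint32_t gpu_index = 0;
  cuda_stream_t *stream = cuda_create_stream(gpu_index);   // assumed helper
  uint32_t max_sm = cuda_get_max_shared_memory(gpu_index); // assumed helper

  int8_t *scratch = nullptr;
  scratch_cuda_multi_bit_programmable_bootstrap_64(
      stream, &scratch, /*lwe_dimension=*/742, /*glwe_dimension=*/1,
      /*polynomial_size=*/2048, /*level_count=*/1, /*grouping_factor=*/2,
      /*input_lwe_ciphertext_count=*/4, max_sm, /*allocate_gpu_memory=*/true,
      /*lwe_chunk_size=*/0);

  cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
      stream, d_lwe_out, d_lwe_out_idx, d_lut, d_lut_idx, d_lwe_in,
      d_lwe_in_idx, d_bsk, scratch, /*lwe_dimension=*/742,
      /*glwe_dimension=*/1, /*polynomial_size=*/2048, /*grouping_factor=*/2,
      /*base_log=*/23, /*level_count=*/1, /*num_samples=*/4, /*num_luts=*/1,
      /*lwe_idx=*/0, max_sm, /*lwe_chunk_size=*/0);

  cleanup_cuda_multi_bit_programmable_bootstrap(stream, &scratch); // assumed
  cuda_destroy_stream(stream);                                     // assumed
}
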
template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
@@ -307,50 +231,50 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
case 256:
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 512:
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 1024:
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 2048:
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 4096:
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 8192:
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 16384:
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
default:
@@ -362,7 +286,7 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
@@ -372,58 +296,51 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
case 256:
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 512:
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 1024:
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 2048:
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 4096:
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 8192:
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 16384:
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -433,123 +350,65 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
}
void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
else
#endif
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
uint64_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
else
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
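// A minimal standalone sketch (not part of the diff) of the variant-selection
// cascade implemented by scratch_cuda_multi_bit_programmable_bootstrap_64
// above. It assumes the results of
// has_support_to_cuda_programmable_bootstrap_tbc_multi_bit and
// supports_cooperative_groups_on_multibit_programmable_bootstrap are passed in
// as booleans; the enum and function names are hypothetical.
enum class MultiBitPbsVariant { Tbc, Cg, Default };

inline MultiBitPbsVariant select_multi_bit_pbs_variant(bool tbc_supported,
                                                       bool cg_supported) {
  // TBC (thread block clusters) requires sm_90+, hence the CUDA_ARCH >= 900
  // preprocessor guard around the TBC branch in the real code.
  if (tbc_supported)
    return MultiBitPbsVariant::Tbc;
  // The cooperative-groups variant is preferred whenever its occupancy and
  // shared-memory checks pass.
  if (cg_supported)
    return MultiBitPbsVariant::Cg;
  // Otherwise fall back to the multi-kernel default implementation.
  return MultiBitPbsVariant::Default;
}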
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
int8_t **buffer) {
cudaSetDevice(gpu_index);
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
x->release(stream);
}
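// Hedged usage sketch (not part of the diff): the scratch -> use -> cleanup
// lifetime of the temporary buffer, written against the cuda_stream_t* variant
// of the entry points shown above. The stream and every parameter value below
// are hypothetical placeholders, not recommended parameters.
inline void multi_bit_pbs_buffer_lifetime_sketch(cuda_stream_t *stream) {
  int8_t *buffer = nullptr;
  scratch_cuda_multi_bit_programmable_bootstrap_64(
      stream, &buffer, /*lwe_dimension=*/888, /*glwe_dimension=*/1,
      /*polynomial_size=*/2048, /*level_count=*/1, /*grouping_factor=*/3,
      /*input_lwe_ciphertext_count=*/4, /*max_shared_memory=*/49152,
      /*allocate_gpu_memory=*/true, /*lwe_chunk_size=*/0);
  // ... launch cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector
  //     with `buffer` as its scratch space here ...
  cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
}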
/**
 * Walks the divisors of num_sms (streaming multiprocessors on the GPU) times
 * max_blocks_per_sm (maximum active blocks per SM when launching
 * device_multi_bit_programmable_bootstrap_keybundle) that are no larger than
 * the square root of that product, and picks one based on max_num_pbs. If
 * log2(max_num_pbs) <= 13, the first suitable divisor is selected; otherwise
 * the (log2(max_num_pbs) - 11)-th suitable divisor is selected, as in the
 * implementation below (see the standalone sketch after this function).
 *
 * The threshold 13 was determined empirically from the memory requirements
 * observed while benchmarking on an RTX 4090 GPU, balancing performance and
 * resource use.
 */
template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t max_shared_memory) {
// Returns a chunk size that is not optimal but close to it for the
// compile-time CUDA_ARCH
__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
int max_blocks_per_sm;
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
polynomial_size / params::opt, full_sm_keybundle);
else
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
polynomial_size / params::opt, 0);
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
int x = num_sms * max_blocks_per_sm;
int count = 0;
int divisor = 1;
int ith_divisor = 0;
#if CUDA_ARCH < 900
// We pick a smaller divisor on GPUs other than H100 so that 256-bit integer
// multiplication can run within the available memory
int log2_max_num_pbs = std::log2(max_num_pbs);
if (log2_max_num_pbs > 13)
ith_divisor = log2_max_num_pbs - 11;
#if CUDA_ARCH >= 900
// H100 (sm_90)
return (ct_count > 10000) ? 30 : 64;
#elif CUDA_ARCH >= 890
// GeForce RTX 4090 (sm_89)
return 8;
#elif CUDA_ARCH >= 800
// A100 (sm_80)
return (ct_count > 10000) ? 30 : 45;
#elif CUDA_ARCH >= 700
// Tesla V100
return (ct_count > 10000) ? 12 : 18;
#else
// Generic case
return (ct_count > 10000) ? 2 : 1;
#endif
for (int i = sqrt(x); i >= 1; i--) {
if (x % i == 0) {
if (count == ith_divisor) {
divisor = i;
break;
} else {
count++;
}
}
}
return divisor;
}
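// Standalone sketch (not part of the diff) of the divisor-based heuristic
// documented above get_lwe_chunk_size. Here `x` stands for
// num_sms * max_blocks_per_sm; the values in main() are hypothetical, and the
// offset of 11 mirrors the CUDA_ARCH < 900 branch of the implementation.
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint32_t pick_chunk_size_sketch(int x, uint32_t max_num_pbs) {
  // For large batches, skip ahead to a smaller divisor to reduce memory use.
  int ith_divisor = 0;
  int log2_max_num_pbs = static_cast<int>(std::log2(max_num_pbs));
  if (log2_max_num_pbs > 13)
    ith_divisor = log2_max_num_pbs - 11;

  // Walk the divisors of x downward from sqrt(x) and keep the ith one found.
  int count = 0;
  int divisor = 1;
  for (int i = static_cast<int>(std::sqrt(x)); i >= 1; i--) {
    if (x % i == 0) {
      if (count == ith_divisor) {
        divisor = i;
        break;
      }
      count++;
    }
  }
  return static_cast<uint32_t>(divisor);
}

int main() {
  // e.g. 114 SMs with 2 resident keybundle blocks each -> x = 228
  std::printf("chunk size = %u\n", pick_chunk_size_sketch(228, 1u << 14));
  return 0;
}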
template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size);
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template void
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
@@ -561,15 +420,15 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
template void
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template void
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
@@ -578,187 +437,3 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if (CUDA_ARCH >= 900)
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
break;
case 512:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
break;
case 1024:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
break;
case 2048:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
break;
case 4096:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
break;
case 8192:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
break;
case 16384:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
allocate_gpu_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be <= number of bits in "
"the ciphertext representation (64)");
switch (polynomial_size) {
case 256:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
case 512:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
case 1024:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
case 2048:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
case 4096:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
case 8192:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
case 16384:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
template void
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
#endif

Some files were not shown because too many files have changed in this diff.