Compare commits

..

7 Commits

Author SHA1 Message Date
Andrei Stoian
3cf0d7895f fix(gpu): vector find lut 2026-02-25 17:34:22 +01:00
Andrei Stoian
07374cf3b7 fix(gpu): vector find lut number 2026-02-25 11:39:04 +01:00
Andrei Stoian
8493b609f1 fix(gpu): multiplication fix lut 2026-02-24 22:12:34 +01:00
Andrei Stoian
db6f7eec8f fix(gpu): protect lut apply to subset on shift 2026-02-24 18:15:55 +01:00
Andrei Stoian
f55d31f38c fix(gpu): avoid broadcast in comparison 2026-02-24 17:23:30 +01:00
Andrei Stoian
9041c8e602 fix(gpu): wrong assert 2026-02-24 10:53:00 +01:00
Andrei Stoian
f52b342db9 fix(gpu): protect lut re-use 2026-02-24 10:01:45 +01:00
151 changed files with 5987 additions and 11266 deletions

View File

@@ -68,12 +68,6 @@ runs:
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
sha256sum -c checksum
sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
# Disable unattended-upgrades to avoid lock issues
sudo systemctl disable --now unattended-upgrades
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/*
sudo apt update
sudo apt -y install cuda-toolkit-"${TOOLKIT_VERSION}"

View File

@@ -14,7 +14,6 @@ on:
- signed_integer
- integer_compression
- integer_zk
- msm_zk
- shortint
- shortint_oprf
- hlapi_unsigned

View File

@@ -36,7 +36,7 @@ jobs:
uses: ./.github/workflows/benchmark_cpu_common.yml
if: inputs.run-cpu-benchmarks
with:
command: integer,hlapi_erc20
command: integer
op_flavor: fast_default
bench_type: both
precisions_set: documentation
@@ -50,40 +50,6 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-cpu-zk-server:
name: benchmark_documentation/run-benchmarks-cpu-zk-server
uses: ./.github/workflows/benchmark_cpu_common.yml
if: inputs.run-cpu-benchmarks
with:
command: integer_zk
op_flavor: default
bench_type: both
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-cpu-zk-client:
name: benchmark_documentation/run-benchmarks-cpu-zk-client
uses: ./.github/workflows/benchmark_wasm_client_common.yml
if: inputs.run-cpu-benchmarks
with:
browser: chrome
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-gpu-integer:
name: benchmark_documentation/run-benchmarks-gpu-integer
uses: ./.github/workflows/benchmark_gpu_common.yml
@@ -91,7 +57,7 @@ jobs:
with:
profile: multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
command: integer_multi_bit,hlapi_erc20
command: integer_multi_bit
op_flavor: fast_default
bench_type: both
precisions_set: documentation
@@ -110,7 +76,7 @@ jobs:
uses: ./.github/workflows/benchmark_hpu_common.yml
if: inputs.run-hpu-benchmarks
with:
command: integer,hlapi_erc20
command: integer
op_flavor: default
bench_type: both
precisions_set: documentation
@@ -172,7 +138,6 @@ jobs:
inputs.generate-svgs }}
needs: [
run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
]
uses: ./.github/workflows/generate_svgs.yml

View File

@@ -31,8 +31,6 @@ on:
- pbs128
- ks
- ks_pbs
- tfhe_zk_pok
- msm_zk
- integer_zk
- integer_aes
- integer_aes256

View File

@@ -58,19 +58,171 @@ jobs:
- tfhe/web_wasm_parallel_tests/**
- .github/workflows/wasm_client_benchmark.yml
run-benchmarks-cpu-zk-client:
name: benchmark_documentation/run-benchmarks-cpu-zk-client
uses: ./.github/workflows/benchmark_wasm_client_common.yml
needs: should-run
setup-instance:
name: benchmark_wasm_client/setup-instance
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
needs: should-run
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-small
wasm-client-benchmarks:
name: benchmark_wasm_client/wasm-client-benchmarks
needs: setup-instance
if: needs.setup-instance.result != 'skipped'
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
max-parallel: 1
matrix:
browser: [ chrome, firefox ]
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
- name: Get Node version
run: |
echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: |
~/.nvm
~/.npm
key: node-${{ env.NODE_VERSION }}
- name: Install Node
if: steps.node-cache.outputs.cache-hit != 'true'
run: |
make install_node
- name: Node cache save
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
~/.nvm
~/.npm
key: node-${{ env.NODE_VERSION }}
- name: Install web resources
run: |
make install_"${BROWSER}"_browser
make install_"${BROWSER}"_web_driver
env:
BROWSER: ${{ matrix.browser }}
- name: Run benchmarks
run: |
make bench_web_js_api_parallel_"${BROWSER}"_ci
env:
BROWSER: ${{ matrix.browser }}
- name: Run benchmarks (unsafe coop)
run: |
make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
env:
BROWSER: ${{ matrix.browser }}
- name: Parse results
run: |
make parse_wasm_benchmarks
python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "m6i.4xlarge" \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--key-gen
rm tfhe-benchmark/wasm_pk_gen.csv
env:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: ${{ github.sha }}_wasm_${{ matrix.browser }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: benchmark_wasm_client/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, wasm-client-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,234 +0,0 @@
# Run WASM client benchmarks on an instance and return parsed results to Slab CI bot.
name: benchmark_wasm_client_common
on:
workflow_call:
inputs:
browser:
type: string # Use comma separated values to generate an array
default: chrome,firefox
secrets:
REPO_CHECKOUT_TOKEN:
required: true
SLAB_ACTION_TOKEN:
required: true
SLAB_BASE_URL:
required: true
SLAB_URL:
required: true
JOB_SECRET:
required: true
SLACK_CHANNEL:
required: true
BOT_USERNAME:
required: true
SLACK_WEBHOOK:
required: true
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
permissions: {}
# zizmor: ignore[concurrency-limits] only Zama organization members and GitHub can trigger this workflow
jobs:
prepare-matrix:
name: benchmark_wasm_client_common/prepare-matrix
runs-on: ubuntu-latest
outputs:
browser: ${{ steps.set_matrix_arg.outputs.browser }}
steps:
- name: Parse user inputs
shell: python
env:
INPUTS_BROWSER: ${{ inputs.browser }}
run: |
import os
inputs_browser = os.environ["INPUTS_BROWSER"]
env_file = os.environ["GITHUB_ENV"]
split_browser = inputs_browser.replace(" ", "").split(",")
with open(env_file, "a") as f:
f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")
- name: Set martix arguments output
id: set_matrix_arg
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
name: benchmark_wasm_client_common/setup-instance
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-small
wasm-client-benchmarks:
name: benchmark_wasm_client_common/wasm-client-benchmarks
needs: [ prepare-matrix, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
max-parallel: 1
matrix:
browser: ${{ fromJSON(needs.prepare-matrix.outputs.browser) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
- name: Get Node version
run: |
echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: |
~/.nvm
~/.npm
key: node-${{ env.NODE_VERSION }}
- name: Install Node
if: steps.node-cache.outputs.cache-hit != 'true'
run: |
make install_node
- name: Node cache save
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
~/.nvm
~/.npm
key: node-${{ env.NODE_VERSION }}
- name: Install web resources
run: |
make install_"${BROWSER}"_browser
make install_"${BROWSER}"_web_driver
env:
BROWSER: ${{ matrix.browser }}
- name: Run benchmarks
run: |
make bench_web_js_api_parallel_"${BROWSER}"_ci
env:
BROWSER: ${{ matrix.browser }}
- name: Run benchmarks (unsafe coop)
run: |
make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
env:
BROWSER: ${{ matrix.browser }}
- name: Parse results
run: |
make parse_wasm_benchmarks
python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "m6i.4xlarge" \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--key-gen
rm tfhe-benchmark/wasm_pk_gen.csv
env:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: ${{ github.sha }}_wasm_${{ matrix.browser }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: benchmark_wasm_client_common/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, wasm-client-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -18,7 +18,7 @@ jobs:
- name: Check first line
uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
with:
pattern: '^((feat|fix|chore|refactor|style|test|docs|doc|perf)(\([\w\-_]+\))?\!?\:) .+$'
pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\!?\:) .+$'
flags: "gs"
error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
excludeDescription: "true" # optional: this excludes the description body of a pull request

View File

@@ -9,9 +9,6 @@ on:
type: string
layer:
type: string
bench_subset:
type: string
default: all
pbs_kind: # Valid values are 'classical', 'multi_bit' or 'any'
type: string
grouping_factor: # Valid values are 2, 3, or 4
@@ -19,9 +16,6 @@ on:
default: 4
bench_type: # Valid values are 'latency', 'throughput'
type: string
name_suffix:
type: string
default: _mean_avx512
backend_comparison:
type: boolean
default: false
@@ -66,8 +60,6 @@ jobs:
--pbs-kind "${PBS_KIND}" \
--grouping-factor "${GROUPING_FACTOR}" \
--bench-type "${BENCH_TYPE}" \
--bench-subset "${BENCH_SUBSET}" \
--name-suffix "${NAME_SUFFIX}" \
--time-span-days "${TIME_SPAN}"
env:
OUTPUT_FILENAME: ${{ inputs.output_filename }}
@@ -78,8 +70,6 @@ jobs:
PBS_KIND: ${{ inputs.pbs_kind }}
GROUPING_FACTOR: ${{ inputs.grouping_factor }}
BENCH_TYPE: ${{ inputs.bench_type }}
BENCH_SUBSET: ${{ inputs.bench_subset }}
NAME_SUFFIX: ${{ inputs.name_suffix }}
TIME_SPAN: ${{ inputs.time_span_days }}
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
@@ -89,7 +79,7 @@ jobs:
if: inputs.backend_comparison == false
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_subset_${{inputs.bench_subset}}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
# This will upload all the file generated
path: ${{ inputs.output_filename }}*.svg
retention-days: 60

View File

@@ -51,7 +51,7 @@ jobs:
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
cpu-integer-throughput-table:
name: generate_documentation_svgs/cpu-integer-throughput-table
name: generate_documentation_svgs/cpu-integer-latency-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-cpu-svgs
with:
@@ -150,124 +150,6 @@ jobs:
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
# -----------------------------------------------------------
# ZK benchmarks tables
# -----------------------------------------------------------
cpu-zk-server-latency-table:
name: generate_documentation_svgs/cpu-zk-server-latency-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-cpu-svgs
with:
backend: cpu
hardware_name: hpc7a.96xlarge
layer: integer
bench_subset: zk
pbs_kind: classical
bench_type: latency
time_span_days: ${{ inputs.time_span_days }}
output_filename: cpu-zk-benchmark-latency
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
cpu-zk-server-throughput-table:
name: generate_documentation_svgs/cpu-zk-server-throughput-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-cpu-svgs
with:
backend: cpu
hardware_name: hpc7a.96xlarge
layer: integer
bench_subset: zk
pbs_kind: classical
bench_type: throughput
time_span_days: ${{ inputs.time_span_days }}
output_filename: cpu-zk-benchmark-throughput
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
cpu-zk-client-latency-table:
name: generate_documentation_svgs/cpu-zk-client-latency-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-cpu-svgs
with:
backend: cpu
hardware_name: m6i.4xlarge
layer: wasm
bench_subset: zk
pbs_kind: classical
bench_type: latency
name_suffix: _chrome_mean
time_span_days: ${{ inputs.time_span_days }}
output_filename: cpu-zk-wasm-benchmark-latency
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
# -----------------------------------------------------------
# ERC20 benchmarks tables
# -----------------------------------------------------------
cpu-erc20-latency-throughput-table:
name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-cpu-svgs
with:
backend: cpu
hardware_name: hpc7a.96xlarge
layer: hlapi
bench_subset: erc20
pbs_kind: classical
bench_type: both
time_span_days: ${{ inputs.time_span_days }}
output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
gpu-erc20-latency-throughput-table:
name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-gpu-svgs
with:
backend: gpu
hardware_name: n3-H100-SXM5x8
layer: hlapi
bench_subset: erc20
pbs_kind: multi_bit
grouping_factor: 4
bench_type: both
time_span_days: ${{ inputs.time_span_days }}
output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
hpu-erc20-latency-throughput-table:
name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-hpu-svgs
with:
backend: hpu
hardware_name: hpu_x1
layer: hlapi
bench_subset: erc20
pbs_kind: classical
bench_type: both
time_span_days: ${{ inputs.time_span_days }}
output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
# -----------------------------------------------------------
# PBS benchmarks tables
# -----------------------------------------------------------

View File

@@ -93,11 +93,6 @@ jobs:
- name: Find tools
run: |
# Disable unattended-upgrades to avoid lock issues
sudo systemctl disable --now unattended-upgrades
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/*
sudo apt update && sudo apt install -y valgrind
find /usr -executable -name "compute-sanitizer"
which valgrind
@@ -111,10 +106,6 @@ jobs:
run: |
make test_high_level_api_gpu_valgrind
- name: Run CUDA backend racecheck tests
run: |
make test_cuda_backend_race_check
slack-notify:
name: gpu_code_validation_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]

View File

@@ -90,12 +90,6 @@ jobs:
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
sha256sum -c checksum
sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
# Disable unattended-upgrades to avoid lock issues
sudo systemctl disable --now unattended-upgrades
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/*
sudo apt update
sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
env:
@@ -138,13 +132,7 @@ jobs:
- name: Run semgrep and lint checks on CUDA code
run: |
# Disable unattended-upgrades to avoid lock issues
sudo systemctl disable --now unattended-upgrades
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/*
sudo apt update
sudo apt -y install python3-venv
sudo apt update && sudo apt -y install python3-venv
make semgrep_and_lint_gpu_code
- name: Check build with hpu enabled

View File

@@ -51,12 +51,7 @@ jobs:
with:
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/zk-cuda-backend/**
- tfhe/src/integer/gpu/zk/**
- tfhe-zk-pok/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_zk_tests.yml'
- ci/slab.toml
@@ -131,9 +126,6 @@ jobs:
- name: Run zk-cuda-backend integration tests
run: |
make test_zk_cuda_backend
make test_zk_pok_gpu
make test_integer_zk_gpu
make test_integer_zk_experimental_gpu
slack-notify:
name: gpu_zk_tests/slack-notify

1
.gitignore vendored
View File

@@ -25,7 +25,6 @@ dieharder_run.log
# Cuda local build
backends/tfhe-cuda-backend/cuda/cmake-build-debug/
backends/tfhe-cuda-backend/cuda/build/
# WASM tests
tfhe/web_wasm_parallel_tests/server.PID

View File

@@ -17,7 +17,7 @@ Start by [forking](https://docs.github.com/en/pull-requests/collaborating-with-p
- **Performance**: For optimal performance, it is highly recommended to run **TFHE-rs** code in release mode with cargo's `--release` flag.
{% endhint %}
To get more details about the library, please refer to the [documentation](https://docs.zama.org/tfhe-rs).
To get more details about the library, please refer to the [documentation](https://docs.zama.ai/tfhe-rs).
## 2. Creating a new branch

View File

@@ -14,7 +14,6 @@ members = [
"utils/tfhe-versionable",
"utils/tfhe-versionable-derive",
"utils/tfhe-backward-compat-data",
"utils/tfhe-backward-compat-data/crates/add_new_version",
"utils/param_dedup",
"tests",
"mockups/tfhe-hpu-mockup",
@@ -45,7 +44,6 @@ bindgen = "0.71"
bincode = "=1.3.3"
cmake = "0.1"
pkg-config = "0.3"
clap = { version = "4.5", features = ["derive"] }
[profile.bench]
lto = "fat"

119
Makefile
View File

@@ -1,7 +1,4 @@
SHELL:=$(shell /usr/bin/env which bash)
# Enable stop on error, no undefined variables
# the c flag is to run the script inline
.SHELLFLAGS := -eu -c
OS:=$(shell uname)
RS_CHECK_TOOLCHAIN:=$(shell cat nightly-toolchain.txt | tr -d '\n')
CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
@@ -270,23 +267,12 @@ install_mlc:
cargo install mlc --locked || \
( echo "Unable to install mlc, unknown error." && exit 1 )
fmt: FMT_CHECK =
.PHONY: fmt # Format rust code
fmt: fmt_internal
check_fmt: FMT_CHECK = --check
.PHONY: check_fmt # Check rust code format
check_fmt: fmt_internal
.PHONY: fmt_internal # internal recipe for fmt
fmt_internal: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt $(FMT_CHECK)
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt $(FMT_CHECK)
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt $(FMT_CHECK)
for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
echo "fmt $$crate"; \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate fmt $(FMT_CHECK); \
done
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt
.PHONY: fmt_js # Format javascript code
fmt_js: check_nvm_installed
@@ -323,6 +309,13 @@ fmt_c_tests:
fmt_toml: install_taplo
taplo fmt
.PHONY: check_fmt # Check rust code format
check_fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check
.PHONY: check_fmt_c_tests # Check C tests format
check_fmt_c_tests:
find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format --dry-run --Werror -style=file {} \;
@@ -353,14 +346,14 @@ check_typos: install_typos_checker
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
--all-targets \
-p tfhe -- --no-deps -D warnings
.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
check_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
--all-targets \
-p tfhe
@@ -374,7 +367,7 @@ clippy_hpu: install_rs_check_toolchain
.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
clippy_gpu_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
--all-targets \
-p tfhe -- --no-deps -D warnings
@@ -467,7 +460,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
fi && \
CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
-p tfhe -- --nocapture
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -543,10 +536,11 @@ clippy_param_dedup: install_rs_check_toolchain
.PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-backward-compat-data -- --no-deps -D warnings
@# Some old crates are x86 specific, only run in that case
@if uname -a | grep -q x86; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all --all-targets \
-- --no-deps -D warnings; \
for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
echo "checking $$crate"; \
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
@@ -670,7 +664,7 @@ build_c_api: install_rs_check_toolchain
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
build_c_api_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
-p tfhe
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
@@ -748,15 +742,6 @@ test_cuda_backend:
"$(MAKE)" -j "$(CPU_COUNT)" && \
"$(MAKE)" test
.PHONY: test_cuda_backend_race_check # Build and run selected CUDA backend tests with Compute Sanitizer racecheck
test_cuda_backend_race_check:
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
"$(MAKE)" -j "$(CPU_COUNT)" test_tfhe_cuda_backend && \
compute-sanitizer --tool racecheck --target-processes all ./tests_and_benchmarks/tests/test_tfhe_cuda_backend \
--gtest_filter="*ClassicalProgrammableBootstrap*:*MultiBitProgrammableBootstrap*"
.PHONY: test_zk_cuda_backend # Run the internal tests of the CUDA ZK backend
test_zk_cuda_backend:
mkdir -p "$(ZKCUDA_BUILD)" && \
@@ -769,7 +754,7 @@ test_zk_cuda_backend:
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu:
@@ -809,7 +794,7 @@ test_integer_hl_test_gpu_check_warnings:
--features=integer,internal-keycache,gpu-debug,zk-pok -vv -p tfhe &> /tmp/gpu_compile_output
WARNINGS=$$(cat /tmp/gpu_compile_output | grep ": warning #" | grep "\[tfhe-cuda-backend" | grep -v "inline qualifier" || true) && \
if [[ "$${WARNINGS}" != "" ]]; then \
echo "FAILING BECAUSE CUDA COMPILATION WARNINGS WERE DETECTED: " && \
echo "FAILING BECAUSE CUDA COMPILATION WARNINGS WERE DETECTED: " && \
echo "$${WARNINGS}" && exit 1; \
fi
@@ -1205,31 +1190,12 @@ test_tfhe_csprng_big_endian: install_cargo_cross
RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
test_zk_pok:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok --features experimental
.PHONY: test_zk_pok_gpu # Run tfhe-zk-pok GPU-accelerated tests
test_zk_pok_gpu:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
.PHONY: test_integer_zk_gpu # Run the tfhe integer GPU ZK tests (integer::gpu::zk::) with the integer, zk-pok and gpu features
test_integer_zk_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release \
--features=integer,zk-pok,gpu -p tfhe -- \
integer::gpu::zk::
.PHONY: test_integer_zk_experimental_gpu # Run the tfhe integer GPU ZK tests (integer::gpu::zk::) with the gpu-experimental-zk feature enabled
test_integer_zk_experimental_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release \
--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
integer::gpu::zk::
.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
test_zk_cuda: install_rs_check_toolchain test_zk_cuda_backend test_zk_pok_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
.PHONY: test_zk_wasm_x86_compat_ci
test_zk_wasm_x86_compat_ci: check_nvm_installed
source ~/.nvm/nvm.sh && \
@@ -1522,47 +1488,27 @@ bench_integer_compression_128b_gpu: install_rs_check_toolchain
--bench glwe_packing_compression_128b-integer-bench \
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
.PHONY: bench_msm_zk
bench_msm_zk: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench zk-msm \
--features=zk-pok -p tfhe-benchmark --profile release --
.PHONY: bench_msm_zk_gpu
bench_msm_zk_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench zk-msm \
--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release --
.PHONY: bench_integer_zk_gpu
bench_integer_zk_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
.PHONY: bench_integer_zk_experimental_gpu
bench_integer_zk_experimental_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
bench_integer_aes_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-aes \
--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
bench_integer_aes256_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-aes256 \
--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
bench_integer_trivium_gpu: install_rs_check_toolchain
@@ -1799,14 +1745,14 @@ bench_hlapi_erc20: install_rs_check_toolchain
.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
bench_hlapi_erc20_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1845,13 +1791,6 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--package tfhe-zk-pok \
--features=gpu-experimental --profile release
.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
bench_hlapi_noise_squash: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \

View File

@@ -10,7 +10,7 @@
<hr/>
<p align="center">
<a href="https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf"> 📃 Read Handbook</a> |<a href="https://docs.zama.org/tfhe-rs"> 📒 Documentation</a> | <a href="https://www.zama.org/community-channels"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
<a href="https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf"> 📃 Read Handbook</a> |<a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
</p>
@@ -47,7 +47,7 @@ production-ready library for all the advanced features of TFHE.
- **Ciphertext and server key compression** for efficient data transfer
- **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.
*Learn more about TFHE-rs features in the [documentation](https://docs.zama.org/tfhe-rs).*
*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
<br></br>
## Table of Contents
@@ -149,7 +149,7 @@ To run this code, use the following command:
> Note that when running code that uses `TFHE-rs`, it is highly recommended
to run in release mode with cargo's `--release` flag to have the best performance possible.
*Find an example with more explanations in [this part of the documentation](https://docs.zama.org/tfhe-rs/get-started/quick-start)*
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*
<p align="right">
<a href="#about" > ↑ Back to top </a>
@@ -163,25 +163,25 @@ to run in release mode with cargo's `--release` flag to have the best performanc
A document containing scientific and technical details about algorithms implemented into the library is available here: [TFHE-rs: A (Practical) Handbook](https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf).
### TFHE deep dive
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.org/post/tfhe-deep-dive-part-1)
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.org/post/tfhe-deep-dive-part-2)
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.org/post/tfhe-deep-dive-part-3)
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.org/post/tfhe-deep-dive-part-4)
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
<br></br>
### Tutorials
- [Video tutorial: Implement signed integers using TFHE-rs](https://www.zama.org/post/video-tutorial-implement-signed-integers-sing-tfhe-rs)
- [Homomorphic parity bit](https://docs.zama.org/tfhe-rs/tutorials/parity-bit)
- [Homomorphic case changing on Ascii string](https://docs.zama.org/tfhe-rs/tutorials/ascii-fhe-string)
- [Boolean SHA256 with TFHE-rs](https://www.zama.org/post/boolean-sha256-tfhe-rs)
- [Dark market with TFHE-rs](https://www.zama.org/post/dark-market-tfhe-rs)
- [Regular expression engine with TFHE-rs](https://www.zama.org/post/regex-engine-tfhe-rs)
- [[Video tutorial] Implement signed integers using TFHE-rs ](https://www.zama.ai/post/video-tutorial-implement-signed-integers-sing-tfhe-rs)
- [Homomorphic parity bit](https://docs.zama.ai/tfhe-rs/tutorials/parity-bit)
- [Homomorphic case changing on Ascii string](https://docs.zama.ai/tfhe-rs/tutorials/ascii-fhe-string)
- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
- [Dark market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
- [Regular expression engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)
*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.org/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
<br></br>
### Documentation
Full, comprehensive documentation is available here: [https://docs.zama.org/tfhe-rs](https://docs.zama.org/tfhe-rs).
Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
<p align="right">
<a href="#about" > ↑ Back to top </a>
</p>
@@ -202,7 +202,7 @@ When a new update is published in the Lattice Estimator, we update parameters ac
### Security model
By default, the parameter sets used in the High-Level API have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
If you want to work within the IND-CPA security model, which is less strict than the IND-CPA^D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.org/tfhe-rs).
If you want to work within the IND-CPA security model, which is less strict than the IND-CPA^D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).
[1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf
@@ -231,7 +231,7 @@ To cite TFHE-rs in academic papers, please use the following entry:
There are two ways to contribute to TFHE-rs:
- [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
- Request to become an official contributor by emailing [hello@zama.org](mailto:hello@zama.org).
- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
<br></br>
@@ -243,16 +243,16 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi
**Is Zama's technology free to use?**
>Zama's libraries are free to use under the BSD 3-Clause Clear license only for development, research, prototyping, and experimentation purposes. However, for any commercial use of Zama's open source code, companies must purchase Zama's commercial patent license.
>
>Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.org/post/open-source).
>Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.ai/post/open-source).
**What do I need to do if I want to use Zama's technology for commercial purposes?**
>To commercially use Zama's technology you need to be granted Zama's patent license. Please contact us at hello@zama.org for more information.
>To commercially use Zama's technology you need to be granted Zama's patent license. Please contact us at hello@zama.ai for more information.
**Do you file IP on your technology?**
>Yes, all Zama's technologies are patented.
**Can you customize a solution for my specific use case?**
>We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.org.
>We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.ai.
<p align="right">
<a href="#about" > ↑ Back to top </a>
@@ -261,7 +261,7 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi
## Support
<a target="_blank" href="https://community.zama.org">
<a target="_blank" href="https://community.zama.ai">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/08656d0a-3f44-4126-b8b6-8c601dff5380">
<source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/1c9c9308-50ac-4aab-a4b9-469bb8c536a4">

View File

@@ -5,8 +5,8 @@ edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"
description = "Cuda implementation of TFHE-rs primitives."
homepage = "https://www.zama.org/"
documentation = "https://docs.zama.org/tfhe-rs"
homepage = "https://www.zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
readme = "README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

View File

@@ -51,4 +51,4 @@ If your machine does not have an available Nvidia GPU, the compilation will work
## License
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
please contact us at `hello@zama.org`.
please contact us at `hello@zama.ai`.

View File

@@ -14,7 +14,6 @@ template <typename Torus> struct int_are_all_block_true_buffer {
// of interest in are_all_block_true(), as with max_value (the maximum message
// value).
int_radix_lut<Torus> *is_max_value;
Torus *preallocated_h_lut;
bool gpu_memory_allocated;
int_are_all_block_true_buffer(CudaStreams streams, COMPARISON_TYPE op,
@@ -40,10 +39,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
max_chunks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
preallocated_h_lut = (Torus *)malloc(safe_mul_sizeof<Torus>(
params.glwe_dimension + 1, params.polynomial_size));
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
is_max_value = new int_radix_lut<Torus>(streams, params, 1, max_chunks,
allocate_gpu_memory, size_tracker);
auto active_streams =
@@ -67,7 +63,6 @@ template <typename Torus> struct int_are_all_block_true_buffer {
delete tmp_out;
delete tmp_block_accumulated;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(preallocated_h_lut);
}
};

View File

@@ -823,6 +823,10 @@ private:
generate_lut_indexes<InputTorus>(streams, generator, get_lut_indexes(0, 0),
num_indexes, num_luts, index_buffer,
gpu_memory_allocated);
if (h_buffer != nullptr) {
memcpy(h_lut_indexes, h_buffer, num_indexes * sizeof(h_lut_indexes[0]));
}
}
/// Sets all LUT indexes to a constant value on both CPU and GPU.
@@ -881,6 +885,39 @@ public:
broadcast_lut(streams, false);
}
// Records that the next LUT application will cover
// `num_radix_blocks_subset` blocks instead of the block count stored by the
// last index broadcast. This method does not generate or broadcast anything
// itself: it only checks that the host-side LUT indexes (h_lut_indexes) still
// hold the expected pattern and then updates last_broadcast_num_radix_blocks.
//
// `generator` is either a callable `(InputTorus *out, uint32_t count)` that
// writes the expected LUT index of every block, or a std::nullptr_t value,
// in which case every stored index is expected to be 0.
template <typename IndexGenerator>
void prepare_to_apply_to_block_subset(uint32_t num_radix_blocks_subset,
IndexGenerator generator) {
// The subset cannot be larger than the number of blocks this LUT holds.
GPU_ASSERT(num_radix_blocks_subset <= num_blocks,
"num_radix_blocks_subset (%u) must not exceed num_blocks (%u)",
num_radix_blocks_subset, num_blocks);
if constexpr (!std::is_same_v<IndexGenerator, std::nullptr_t>) {
// Rebuild the expected indexes for all num_blocks blocks (not only the
// subset) into a temporary host vector.
std::vector<InputTorus> expected(num_blocks);
generator(expected.data(), num_blocks);
for (uint32_t i = 0; i < num_blocks; i++) {
// A mismatch means the stored indexes are not the pattern the caller
// expects to apply.
GPU_ASSERT(expected[i] == h_lut_indexes[i],
"LUT index mismatch at block %u: expected %llu, stored %llu",
i, (unsigned long long)expected[i],
(unsigned long long)h_lut_indexes[i]);
}
} else {
// No generator supplied: every block is expected to use LUT index 0.
for (uint32_t i = 0; i < num_blocks; i++) {
GPU_ASSERT(h_lut_indexes[i] == 0,
"LUT index mismatch at block %u: expected 0, stored %llu", i,
(unsigned long long)h_lut_indexes[i]);
}
}
// Publish the subset size; NOTE(review): presumably this is the value the
// apply_*_lookup_table consistency asserts compare against — confirm.
last_broadcast_num_radix_blocks = num_radix_blocks_subset;
}
// Broadcast luts from device gpu_indexes[0] to all active gpus
void broadcast_lut(CudaStreams new_active_streams,
bool broadcast_lut_values = true) {
@@ -1268,6 +1305,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
bool gpu_memory_allocated;
uint32_t bits_per_block = 0;
uint32_t num_radix_blocks = 0;
// With offset
int_bit_extract_luts_buffer(CudaStreams streams, int_radix_params params,
@@ -1277,6 +1316,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
uint64_t &size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
this->bits_per_block = bits_per_block;
this->num_radix_blocks = num_radix_blocks;
lut = new int_radix_lut<Torus>(streams, params, bits_per_block,
bits_per_block * num_radix_blocks,
@@ -1303,10 +1344,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
auto lut_index_generator =
[num_radix_blocks, bits_per_block](Torus *h_lut_indexes, uint32_t) {
for (int j = 0; j < num_radix_blocks; j++) {
for (int i = 0; i < bits_per_block; i++)
h_lut_indexes[i + j * bits_per_block] = i;
}
compute_bit_extract_lut_indexes(h_lut_indexes, num_radix_blocks,
bits_per_block);
};
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
@@ -1354,11 +1393,28 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
num_radix_blocks, allocate_gpu_memory,
size_tracker) {}
// Prepares `lut` to be applied to `effective_num_blocks` blocks.
// The expected LUT indexes are recomputed from the num_radix_blocks and
// bits_per_block values stored at construction, and handed to
// lut->prepare_to_apply_to_block_subset together with the requested count.
void prepare_lut_for_blocks(uint32_t effective_num_blocks) {
// The second generator argument (index count) is ignored: the pattern is
// fully determined by the stored num_radix_blocks and bits_per_block.
auto generator = [this](Torus *h, uint32_t) {
compute_bit_extract_lut_indexes(h, num_radix_blocks, bits_per_block);
};
lut->prepare_to_apply_to_block_subset(effective_num_blocks, generator);
}
void release(CudaStreams streams) {
lut->release(streams);
delete (lut);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
private:
// Fills `h` with the bit-extract LUT index pattern: for each of the
// `num_radix_blocks` blocks, the `bits_per_block` consecutive entries receive
// the values 0, 1, ..., bits_per_block - 1. `h` must have room for
// num_radix_blocks * bits_per_block entries.
//
// Loop counters are uint32_t to match the bounds' type and avoid
// signed/unsigned comparisons.
static void compute_bit_extract_lut_indexes(Torus *h,
                                            uint32_t num_radix_blocks,
                                            uint32_t bits_per_block) {
  for (uint32_t j = 0; j < num_radix_blocks; j++)
    for (uint32_t i = 0; i < bits_per_block; i++)
      h[i + j * bits_per_block] = i;
}
};
template <typename Torus> struct int_fullprop_buffer {

View File

@@ -25,24 +25,39 @@ void cuda_convert_lwe_programmable_bootstrap_key_128_async(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
uint64_t scratch_cuda_programmable_bootstrap_amortized_64_async(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
void cuda_programmable_bootstrap_amortized_64_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
void cleanup_cuda_programmable_bootstrap_amortized_64(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
uint64_t scratch_cuda_programmable_bootstrap_64_async(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint64_t scratch_cuda_programmable_bootstrap_tbc_generic_64_async(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint64_t scratch_cuda_programmable_bootstrap_tbc_2_2_64_async(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint64_t scratch_cuda_programmable_bootstrap_128_async(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -67,24 +82,6 @@ void cuda_programmable_bootstrap_64_async(
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_tbc_64_generic_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_tbc_64_2_2_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_128_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lut_vector, void const *lwe_array_in,

View File

@@ -25,16 +25,6 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64_async(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_generic_64_async(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_2_2_64_async(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_multi_bit_programmable_bootstrap_64_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
@@ -45,26 +35,6 @@ void cuda_multi_bit_programmable_bootstrap_64_async(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cuda_multi_bit_programmable_bootstrap_tbc_64_generic_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cuda_multi_bit_programmable_bootstrap_tbc_64_2_2_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);

View File

@@ -128,32 +128,22 @@ __host__ void are_all_comparisons_block_true(
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
} else {
if (chunk_lengths[num_chunks - 1] != max_value) {
// LUT needs to be computed
uint32_t chunk_length = chunk_lengths[num_chunks - 1];
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
return x == chunk_length;
};
auto num_blocks = is_max_value_lut->num_blocks;
auto active_streams =
streams.active_gpu_subset(num_chunks, params.pbs_type);
// Index generator: last chunk uses LUT 1, others use LUT 0
auto index_gen = [num_chunks, num_blocks](Torus *h_lut_indexes,
uint32_t) {
for (uint32_t index = 0; index < num_blocks; index++) {
if (index == num_chunks - 1) {
h_lut_indexes[index] = 1;
} else if (index < num_chunks - 1 || index >= num_chunks) {
h_lut_indexes[index] = 0;
}
}
};
is_max_value_lut->generate_and_broadcast_lut(
active_streams, {1}, {is_equal_to_num_blocks_lut_f}, index_gen,
true, {are_all_block_true_buffer->preallocated_h_lut});
// Pad the last chunk's accumulator block up to max_value by adding
// trivial plaintext 1s, so that all chunks can use the same
// is_max_value LUT uniformly.
uint32_t last_chunk_length = chunk_lengths[num_chunks - 1];
if (last_chunk_length != max_value) {
uint32_t pad = max_value - last_chunk_length;
GPU_ASSERT(pad < max_value,
"pad (%u) must be strictly less than max_value (%u)", pad,
max_value);
Torus delta = (Torus(1) << (sizeof(Torus) * 8 - 1)) /
(message_modulus * carry_modulus);
Torus *last_block_ptr = (Torus *)accumulator->ptr +
(num_chunks - 1) * (big_lwe_dimension + 1);
device_add_scalar_one_inplace<<<1, 1, 0, streams.stream(0)>>>(
last_block_ptr, 1, big_lwe_dimension, pad * delta);
check_cuda_error(cudaGetLastError());
}
lut = is_max_value_lut;
}
@@ -163,13 +153,6 @@ __host__ void are_all_comparisons_block_true(
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
// Reset max_value_lut_indexes before returning, otherwise if the lut is
// reused the lut indexes will be wrong
auto active_gpu_count_is_max = streams.active_gpu_subset(
is_max_value_lut->num_blocks, params.pbs_type);
is_max_value_lut->set_lut_indexes_and_broadcast_constant(
active_gpu_count_is_max, 0);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
} else {

View File

@@ -546,7 +546,9 @@ __host__ void integer_radix_apply_univariate_lookup_table(
// Verify consistency between set_lut_indexes and apply_lookup_table
GPU_ASSERT(
num_radix_blocks <= lut->last_broadcast_num_radix_blocks,
lut->num_luts == 1
? num_radix_blocks <= lut->last_broadcast_num_radix_blocks
: num_radix_blocks == lut->last_broadcast_num_radix_blocks,
"num_radix_blocks (%u) must match last_broadcast_num_radix_blocks (%u)",
num_radix_blocks, lut->last_broadcast_num_radix_blocks);
GPU_ASSERT(active_streams.count() <= lut->last_broadcast_streams.count(),
@@ -655,6 +657,13 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output radix ciphertexts should have the same "
"lwe dimension")
GPU_ASSERT(
lut->num_luts == 1 ? lwe_array_in->num_radix_blocks <=
lut->last_broadcast_num_radix_blocks
: lwe_array_in->num_radix_blocks ==
lut->last_broadcast_num_radix_blocks,
"num_radix_blocks (%u) must match last_broadcast_num_radix_blocks (%u)",
lwe_array_in->num_radix_blocks, lut->last_broadcast_num_radix_blocks);
auto num_radix_blocks = lwe_array_in->num_radix_blocks;
/// For multi GPU execution we create vectors of pointers for inputs and
@@ -747,6 +756,12 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
if (num_radix_blocks > lut->num_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of lut radix blocks")
GPU_ASSERT(
lut->num_luts == 1
? num_radix_blocks <= lut->last_broadcast_num_radix_blocks
: num_radix_blocks == lut->last_broadcast_num_radix_blocks,
"num_radix_blocks (%u) must match last_broadcast_num_radix_blocks (%u)",
num_radix_blocks, lut->last_broadcast_num_radix_blocks);
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
num_radix_blocks > lwe_array_1->num_radix_blocks ||
num_radix_blocks > lwe_array_2->num_radix_blocks)
@@ -1376,10 +1391,18 @@ void host_resolve_group_carries_sequentially(
// Apply the lut
auto luts_sequential = mem->lut_sequential_algorithm;
auto lut_index_generator = [](Torus *h_lut_indexes,
uint32_t num_indexes) {
for (uint32_t i = 0; i < num_indexes; i++)
h_lut_indexes[i] = i;
};
luts_sequential->prepare_to_apply_to_block_subset(blocks_to_solve,
lut_index_generator);
CudaRadixCiphertextFFI shifted_group_resolved_carries;
as_radix_ciphertext_slice<Torus>(&shifted_group_resolved_carries,
group_resolved_carries, 1,
blocks_to_solve + 1);
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &shifted_group_resolved_carries,
&shifted_group_resolved_carries, bsks, ksks, luts_sequential,
@@ -1716,6 +1739,7 @@ extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
num_radix_blocks);
}
}
bit_extract->prepare_lut_for_blocks(effective_num_radix_blocks);
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_out, bsks, ksks, bit_extract->lut,
effective_num_radix_blocks);

View File

@@ -414,6 +414,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->prepare_to_apply_to_block_subset(
total_ciphertexts, LUT_0_FOR_ALL_BLOCKS);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table<Torus>(
@@ -467,6 +469,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->prepare_to_apply_to_block_subset(
num_blocks_in_apply_lut, LUT_0_FOR_ALL_BLOCKS);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table<Torus>(

View File

@@ -0,0 +1,317 @@
#include "programmable_bootstrap_amortized.cuh"
/*
 * Scratch entry point for the amortized PBS on 64-bit inputs.
 *
 * Dispatches on `polynomial_size` to the matching instantiation of
 * scratch_programmable_bootstrap_amortized, which allocates the working
 * memory into `pbs_buffer` and configures the shared-memory options of the
 * kernel when the FULLSM or PARTIALSM mode is going to be used.
 *
 * Returns the size reported by the underlying scratch routine. Panics when
 * `polynomial_size` is not a power of two in [256..16384].
 */
uint64_t scratch_cuda_programmable_bootstrap_amortized_64_async(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
  // The opaque stream handle is the same for every instantiation.
  auto cuda_stream = static_cast<cudaStream_t>(stream);

  switch (polynomial_size) {
  case 256:
    return scratch_programmable_bootstrap_amortized<uint64_t,
                                                    AmortizedDegree<256>>(
        cuda_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, allocate_gpu_memory);
  case 512:
    return scratch_programmable_bootstrap_amortized<uint64_t,
                                                    AmortizedDegree<512>>(
        cuda_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, allocate_gpu_memory);
  case 1024:
    return scratch_programmable_bootstrap_amortized<uint64_t,
                                                    AmortizedDegree<1024>>(
        cuda_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, allocate_gpu_memory);
  case 2048:
    return scratch_programmable_bootstrap_amortized<uint64_t,
                                                    AmortizedDegree<2048>>(
        cuda_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, allocate_gpu_memory);
  case 4096:
    return scratch_programmable_bootstrap_amortized<uint64_t,
                                                    AmortizedDegree<4096>>(
        cuda_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, allocate_gpu_memory);
  case 8192:
    return scratch_programmable_bootstrap_amortized<uint64_t,
                                                    AmortizedDegree<8192>>(
        cuda_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, allocate_gpu_memory);
  case 16384:
    return scratch_programmable_bootstrap_amortized<uint64_t,
                                                    AmortizedDegree<16384>>(
        cuda_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, allocate_gpu_memory);
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..16384].")
  }
}
/* Perform the programmable bootstrapping on a batch of input u32 LWE
 * ciphertexts. See the corresponding operation on 64 bits for more details.
 *
 * The untyped pointers are reinterpreted as u32 arrays (double2 for the
 * bootstrapping key) and forwarded to the host routine instantiated for the
 * requested `polynomial_size`.
 *
 * Panics when `base_log` is larger than 32 or when `polynomial_size` is not a
 * power of two in [256..16384].
 *
 * NOTE(review): host_programmable_bootstrap_amortized appears to reject any
 * Torus that is not 64 bits wide - confirm whether this 32-bit entry point is
 * still expected to be usable.
 */
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32_async(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples) {

  // The check rejects base_log > 32, so the message must state the accepted
  // range (<=), not the rejected one.
  if (base_log > 32)
    PANIC("Cuda error (amortized PBS): base log should be <= number of bits in "
          "the ciphertext representation (32)");

  // Typed views shared by every case below. The host routine takes mutable
  // pointers, so constness is dropped here exactly as the per-case casts did.
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto out = (uint32_t *)lwe_array_out;
  auto out_indexes = (uint32_t *)lwe_output_indexes;
  auto luts = (uint32_t *)lut_vector;
  auto lut_indexes = (uint32_t *)lut_vector_indexes;
  auto in = (uint32_t *)lwe_array_in;
  auto in_indexes = (uint32_t *)lwe_input_indexes;
  auto bsk = (double2 *)bootstrapping_key;

  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 8192:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 16384:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..16384].")
  }
}
/* Perform the programmable bootstrapping on a batch of input u64 LWE
 * ciphertexts. This function performs best for large numbers of inputs (> 10).
 *  - `stream` is a void pointer to the Cuda stream to be used in the kernel
 *  launch
 *  - `gpu_index` is the index of the GPU to be used in the kernel launch
 *  - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
 * (a0,..an-1,b) where n is the LWE dimension
 *  - lwe_output_indexes: for each sample, the index of the output ciphertext
 * it is written to in lwe_array_out
 *  - lut_vector: should hold as many luts of size polynomial_size
 * as there are input ciphertexts, but actually holds
 * num_luts vectors to reduce memory usage
 *  - lut_vector_indexes: stores the index corresponding to
 * which lut of lut_vector to use for each LWE input in
 * lwe_array_in
 *  - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
 * mask values + 1 body value
 *  - lwe_input_indexes: for each sample, the index of the input ciphertext it
 * reads from lwe_array_in
 *  - bootstrapping_key: GGSW encryption of the LWE secret key sk1
 * under secret key sk2
 * bsk = Z + sk1 H
 * where H is the gadget matrix and Z is a matrix (k+1).l
 * containing GLWE encryptions of 0 under sk2.
 * bsk is thus a tensor of size (k+1)^2.l.N.n
 * where l is the number of decomposition levels and
 * k is the GLWE dimension, N is the polynomial size for
 * GLWE. The polynomial size for GLWE and the lut
 * are the same because they have to be in the same ring
 * to be multiplied.
 *  - pbs_buffer: scratch memory prepared by the matching scratch function
 *  - lwe_dimension: size of the Torus vector used to encrypt the input
 * LWE ciphertexts - referred to as n above (~ 600)
 *  - glwe_dimension: the GLWE dimension, referred to as k above
 *  - polynomial_size: size of the test polynomial (lut) and size of the
 * GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
 *  - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
 *  - level_count: number of decomposition levels in the gadget matrix (~4)
 *  - num_samples: number of encrypted input messages
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
 *   - the kernel is templatized based on integer discretization and
 * polynomial degree
 *   - num_samples blocks of threads are launched, where each thread is going
 * to handle one or more polynomial coefficients at each stage:
 *       - perform the blind rotation
 *       - round the result
 *       - decompose into level_count levels, then for each level:
 *         - switch to the FFT domain
 *         - multiply with the bootstrapping key
 *         - come back to the coefficients representation
 *   - between each stage a synchronization of the threads is necessary
 *   - in case the device has enough shared memory, temporary arrays used for
 * the different stages (accumulators) are stored into the shared memory
 *   - the accumulators serve to combine the results for all decomposition
 * levels
 *   - the constant memory (64K) is used for storing the roots of unity
 * values for the FFT
 *
 * Panics when `base_log` is larger than 64 or when `polynomial_size` is not a
 * power of two in [256..16384].
 */
void cuda_programmable_bootstrap_amortized_64_async(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples) {

  // The check rejects base_log > 64, so the message must state the accepted
  // range (<=), not the rejected one.
  if (base_log > 64)
    PANIC("Cuda error (amortized PBS): base log should be <= number of bits in "
          "the ciphertext representation (64)");

  // Typed views shared by every case below. The host routine takes mutable
  // pointers, so constness is dropped here exactly as the per-case casts did.
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto out = (uint64_t *)lwe_array_out;
  auto out_indexes = (uint64_t *)lwe_output_indexes;
  auto luts = (uint64_t *)lut_vector;
  auto lut_indexes = (uint64_t *)lut_vector_indexes;
  auto in = (uint64_t *)lwe_array_in;
  auto in_indexes = (uint64_t *)lwe_input_indexes;
  auto bsk = (double2 *)bootstrapping_key;

  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 8192:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 16384:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
        cuda_stream, gpu_index, out, out_indexes, luts, lut_indexes, in,
        in_indexes, bsk, pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..16384].")
  }
}
/*
 * Releases the amortized PBS scratch buffer on the GPU (32 or 64 bits
 * inputs): the buffer is dropped on `stream`, the caller's pointer is reset
 * to nullptr, and the stream is synchronized before returning.
 */
void cleanup_cuda_programmable_bootstrap_amortized_64(void *stream,
                                                      uint32_t gpu_index,
                                                      int8_t **pbs_buffer) {
  auto cuda_stream = static_cast<cudaStream_t>(stream);

  // Queue the release, then clear the handle so it cannot be reused.
  cuda_drop_async(*pbs_buffer, cuda_stream, gpu_index);
  *pbs_buffer = nullptr;

  cuda_synchronize_stream(cuda_stream, gpu_index);
}

View File

@@ -0,0 +1,371 @@
#ifndef CUDA_AMORTIZED_PBS_CUH
#define CUDA_AMORTIZED_PBS_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "bootstrapping_key.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
/*
 * Kernel launched by host_programmable_bootstrap_amortized
 *
 * Each thread block processes one sample: blockIdx.x selects the input,
 * output and lut indexes, as well as the block's slice of device_mem.
 *
 * Uses shared memory to increase performance
 *  - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
 * (a0,..an-1,b) where n is the LWE dimension
 *  - lwe_output_indexes: for each block, index of the output ciphertext it
 * writes in lwe_array_out
 *  - lut_vector: should hold as many luts of size polynomial_size
 * as there are input ciphertexts, but actually holds
 * num_luts vectors to reduce memory usage
 *  - lut_vector_indexes: stores the index corresponding to which lut
 * to use for each sample in lut_vector
 *  - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
 * mask values + 1 body value
 *  - lwe_input_indexes: for each block, index of the input ciphertext it
 * reads in lwe_array_in
 *  - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
 * key sk2
 *  - device_mem: pointer to the device's global memory in case we use it (SMD
 * == NOSM or PARTIALSM)
 *  - glwe_dimension: GLWE dimension; every buffer below holds
 * (glwe_dimension + 1) polynomials
 *  - lwe_dimension: size of the Torus vector used to encrypt the input
 * LWE ciphertexts - referred to as n above (~ 600)
 *  - polynomial_size: size of the test polynomial (lut) and size of the
 * GLWE polynomial (~1024)
 *  - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 *  - level_count: number of decomposition levels in the gadget matrix (~4)
 *  - device_memory_size_per_sample: byte stride between the device_mem regions
 * of two consecutive blocks, used when SMD is not FULLSM
 */
__global__ void device_programmable_bootstrap_amortized(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key, int8_t *device_mem,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
size_t device_memory_size_per_sample) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
// FULLSM: all working buffers live in shared memory. Otherwise they start in
// this block's slice of device_mem.
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
// Working buffers are carved back to back out of selected_memory, each sized
// for (glwe_dimension + 1) polynomials:
// accumulator | accumulator_rotated | res_fft | accumulator_fft
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
double2 *res_fft =
(double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
(sizeof(double2) / sizeof(Torus));
// PARTIALSM keeps accumulator_fft at the start of shared memory; in the other
// modes it follows res_fft inside selected_memory.
double2 *accumulator_fft = (double2 *)sharedmem;
if constexpr (SMD != PARTIALSM)
accumulator_fft = (double2 *)res_fft +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);
// Locate this block's input ciphertext and lut through the index arrays
auto block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
(glwe_dimension + 1)];
// Put "b", the body, in [0, 2N[
constexpr auto log_modulus = params::log2_degree + 1;
Torus b_hat = 0;
auto correction = centered_binary_modulus_switch_body_correction_to_add(
block_lwe_array_in, lwe_dimension, log_modulus);
modulus_switch(block_lwe_array_in[lwe_dimension] + correction, b_hat,
log_modulus);
// Initialize the accumulator from the lut and b_hat
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);
// Loop over all the mask elements of the sample to accumulate
// (X^a_i-1) multiplication, decomposition of the resulting polynomial
// into level_count polynomials, and performing polynomial multiplication
// via an FFT with the RGSW encrypted secret key
for (int iteration = 0; iteration < lwe_dimension; iteration++) {
__syncthreads();
// Put "a" in [0, 2N[ instead of Zq
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[iteration], a_hat, log_modulus);
// Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);
__syncthreads();
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count, glwe_dimension + 1);
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int i = 0; i < (glwe_dimension + 1); i++)
for (int j = 0; j < params::opt / 2; j++) {
res_fft[pos].x = 0;
res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
GadgetMatrix<Torus, params> gadget(base_log, level_count,
accumulator_rotated, glwe_dimension + 1);
// Now that the rotation is done, decompose the resulting polynomial
// coefficients so as to multiply each decomposed level with the
// corresponding part of the bootstrapping key
for (int level = level_count - 1; level >= 0; level--) {
for (int i = 0; i < (glwe_dimension + 1); i++) {
gadget.decompose_and_compress_next_polynomial(accumulator_fft, i);
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
level, polynomial_size,
glwe_dimension, level_count);
// Perform the coefficient-wise product with each of the
// (glwe_dimension + 1) polynomials of the bootstrapping key slice
for (int j = 0; j < (glwe_dimension + 1); j++) {
auto bsk_poly = bsk_slice + j * params::degree / 2;
auto res_fft_poly = res_fft + j * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
res_fft_poly, accumulator_fft, bsk_poly);
}
}
__syncthreads();
}
// Come back to the coefficient representation
if constexpr (SMD == FULLSM || SMD == NOSM) {
// res_fft is transformed in place, then added to the accumulator
__syncthreads();
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
}
__syncthreads();
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
}
__syncthreads();
} else {
// PARTIALSM: each res_fft polynomial is first copied into accumulator_fft
// (the shared-memory buffer), inverse-transformed there, then added to the
// accumulator
#pragma unroll
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
int tid = threadIdx.x;
for (int j = 0; j < params::opt / 2; j++) {
accumulator_fft[tid] = res_fft_slice[tid];
tid = tid + params::degree / params::opt;
}
__syncthreads();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
__syncthreads();
add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
}
__syncthreads();
}
}
// Locate this block's output ciphertext, of size
// glwe_dimension * polynomial_size + 1
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1)];
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
// No need to sync here, it is already synchronized after add_to_torus
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
}
/*
 * Number of bytes of working memory one sample needs when every buffer of
 * the amortized PBS is held in the same memory region (full shared memory
 * mode): accumulator, rotated accumulator, FFT accumulator and FFT result.
 */
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_amortized(
    uint32_t polynomial_size, uint32_t glwe_dimension) {
  // (glwe_dimension + 1) Torus polynomials
  const auto accumulator_bytes = safe_mul_sizeof<Torus>(
      (size_t)polynomial_size, (size_t)(glwe_dimension + 1));
  // (glwe_dimension + 1) Torus polynomials
  const auto accumulator_rotated_bytes = safe_mul_sizeof<Torus>(
      (size_t)polynomial_size, (size_t)(glwe_dimension + 1));
  // a single polynomial of polynomial_size / 2 double2 values
  const auto accumulator_fft_bytes =
      safe_mul_sizeof<double2>(polynomial_size / 2);
  // (glwe_dimension + 1) polynomials of polynomial_size / 2 double2 values
  const auto res_fft_bytes = safe_mul_sizeof<double2>(
      (size_t)(polynomial_size / 2), (size_t)(glwe_dimension + 1));

  return accumulator_bytes + accumulator_rotated_bytes +
         accumulator_fft_bytes + res_fft_bytes;
}
/*
 * Number of bytes of shared memory one sample needs in partial shared memory
 * mode: only the FFT accumulator (polynomial_size / 2 double2 values).
 */
template <typename Torus>
uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
    uint32_t polynomial_size) {
  const auto accumulator_fft_bytes =
      safe_mul_sizeof<double2>(polynomial_size / 2);
  return accumulator_fft_bytes;
}
/*
 * Size in bytes of the global-memory scratch buffer the amortized PBS needs
 * for `input_lwe_ciphertext_count` samples, given the amount of shared memory
 * available per block:
 *  - less than the partial shared memory requirement: every buffer lives in
 * global memory (full size per sample)
 *  - less than the full shared memory requirement: everything except the FFT
 * accumulator lives in global memory
 *  - otherwise: no global memory is needed
 * The result is rounded up to a multiple of sizeof(double2).
 */
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_amortized(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size);
  uint64_t partial_dm = full_sm - partial_sm;
  uint64_t full_dm = full_sm;

  uint64_t device_mem = 0;
  if (max_shared_memory < partial_sm) {
    device_mem = full_dm * input_lwe_ciphertext_count;
  } else if (max_shared_memory < full_sm) {
    device_mem = partial_dm * input_lwe_ciphertext_count;
  }

  // Pad up to the next multiple of sizeof(double2). Adding the remainder
  // itself only lands on a multiple when the remainder is exactly half the
  // alignment, so add the missing complement instead.
  constexpr uint64_t alignment = sizeof(double2);
  const uint64_t remainder = device_mem % alignment;
  if (remainder != 0)
    device_mem += alignment - remainder;
  return device_mem;
}
/*
 * Prepares the amortized PBS for the given parameters on `gpu_index`:
 * configures the dynamic shared memory size and cache preference of the
 * kernel variant that matches the available shared memory (PARTIALSM or
 * FULLSM; nothing is configured when even the partial requirement does not
 * fit), then allocates the scratch buffer into `pbs_buffer`.
 *
 * Returns the size tracked by cuda_malloc_with_size_tracking_async.
 */
template <typename Torus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_amortized(
    cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  cuda_set_device(gpu_index);

  const uint64_t full_sm_bytes =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
  const uint64_t partial_sm_bytes =
      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size);
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  if (max_shared_memory >= partial_sm_bytes) {
    if (max_shared_memory < full_sm_bytes) {
      // Only the FFT accumulator fits in shared memory
      check_cuda_error(cudaFuncSetAttribute(
          device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_bytes));
      check_cuda_error(cudaFuncSetCacheConfig(
          device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
          cudaFuncCachePreferShared));
    } else {
      // Every working buffer fits in shared memory
      check_cuda_error(cudaFuncSetAttribute(
          device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_bytes));
      check_cuda_error(cudaFuncSetCacheConfig(
          device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
          cudaFuncCachePreferShared));
    }
  }

  uint64_t tracked_size = 0;
  const uint64_t scratch_bytes =
      get_buffer_size_programmable_bootstrap_amortized<Torus>(
          glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
          max_shared_memory);
  *pbs_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
      scratch_bytes, stream, gpu_index, tracked_size, allocate_gpu_memory);
  check_cuda_error(cudaGetLastError());
  return tracked_size;
}
/*
 * Host-side launcher of the amortized PBS kernel.
 *
 * Launches one thread block per input ciphertext, with
 * polynomial_size / params::opt threads each, and picks the kernel variant
 * (NOSM, PARTIALSM or FULLSM) from the shared memory available on
 * `gpu_index`. Panics unless Torus is 64 bits wide.
 */
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_amortized(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count) {

  PANIC_IF_FALSE(sizeof(Torus) == 8,
                 "Error: Programmable bootstrap amortized only supports 64-bit "
                 "Torus type.");

  // Per-sample memory requirements of the two shared-memory modes, and the
  // matching per-sample global memory strides.
  const uint64_t full_sm_bytes =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
  const uint64_t partial_sm_bytes =
      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size);
  const uint64_t partial_dm_bytes = full_sm_bytes - partial_sm_bytes;
  const uint64_t full_dm_bytes = full_sm_bytes;

  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);

  // 1-dimensional grid: one block per sample, and each thread handles opt
  // polynomial coefficients (actually opt/2 coefficients since the real
  // polynomial is compressed into a complex one).
  dim3 grid(input_lwe_ciphertext_count, 1, 1);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  const bool partial_fits = !(max_shared_memory < partial_sm_bytes);
  const bool full_fits = !(max_shared_memory < full_sm_bytes);

  if (!partial_fits) {
    // No shared memory: every buffer lives in pbs_buffer
    device_programmable_bootstrap_amortized<Torus, params, NOSM>
        <<<grid, thds, 0, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
            level_count, full_dm_bytes);
  } else if (!full_fits) {
    // Partial shared memory: only the FFT accumulator is in shared memory
    device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm_bytes, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
            level_count, partial_dm_bytes);
  } else {
    // Full shared memory: the whole working set is requested as dynamic
    // shared memory and no per-sample global memory stride is needed.
    device_programmable_bootstrap_amortized<Torus, params, FULLSM>
        <<<grid, thds, full_sm_bytes, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
            level_count, 0);
  }
  check_cuda_error(cudaGetLastError());
}
#endif // CUDA_AMORTIZED_PBS_CUH

View File

@@ -404,11 +404,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle_with_mode<Torus, params>(
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset,
MultiBitKeybundleLaunchMode::GENERIC);
grouping_factor, level_count, lwe_offset);
// Accumulate
execute_cg_external_product_loop<Torus, params>(

View File

@@ -195,81 +195,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
" in the interval [256..16384].")
}
}
/*
 * Dispatches the generic TBC classical PBS to the host routine instantiated
 * for the requested `polynomial_size`. All arguments are forwarded unchanged,
 * apart from the stream which is cast to cudaStream_t.
 *
 * Panics when `polynomial_size` is not a power of two in [256..16384].
 */
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector_generic(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 2048:
// NOTE(review): this is the only case instantiated with Degree<> instead of
// AmortizedDegree<> - confirm this is intentional and not a typo.
host_programmable_bootstrap_tbc_generic<Torus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}
#endif
template <typename Torus>
@@ -408,46 +333,6 @@ uint64_t scratch_cuda_programmable_bootstrap_64_async(
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
}
/*
 * Scratch entry point for the 64-bit classical PBS on the TBC path.
 *
 * When compiled with CUDA_ARCH >= 900 this reinterprets `buffer` as a
 * `pbs_buffer<uint64_t, CLASSICAL> **` and forwards every argument to
 * scratch_cuda_programmable_bootstrap_tbc<uint64_t>, returning its result.
 * On older architectures all parameters are marked unused and the call panics.
 */
uint64_t scratch_cuda_programmable_bootstrap_tbc_generic_64_async(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
#if (CUDA_ARCH >= 900)
return scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
#else
// Silence unused-parameter warnings on builds without TBC support.
(void)stream;
(void)gpu_index;
(void)buffer;
(void)lwe_dimension;
(void)glwe_dimension;
(void)polynomial_size;
(void)level_count;
(void)input_lwe_ciphertext_count;
(void)allocate_gpu_memory;
(void)noise_reduction_type;
PANIC("Cuda error (classical PBS): TBC pbs is not supported.")
#endif
}
/*
 * Scratch entry point for the specialized "2_2" classical TBC PBS.
 *
 * Panics unless the parameters are exactly N=2048, level_count=1 and
 * glwe_dimension=1, then delegates to the generic TBC scratch entry point
 * with the arguments unchanged and returns its result.
 */
uint64_t scratch_cuda_programmable_bootstrap_tbc_2_2_64_async(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  // The specialized variant only exists for this single parameter set.
  const bool matches_2_2_params =
      glwe_dimension == 1 && level_count == 1 && polynomial_size == 2048;
  PANIC_IF_FALSE(
      matches_2_2_params,
      "Cuda error (classical PBS): specialized TBC 2_2 scratch requires "
      "(N=2048, level_count=1, glwe_dimension=1).");

  return scratch_cuda_programmable_bootstrap_tbc_generic_64_async(
      stream, gpu_index, buffer, lwe_dimension, glwe_dimension,
      polynomial_size, level_count, input_lwe_ciphertext_count,
      allocate_gpu_memory, noise_reduction_type);
}
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
@@ -784,102 +669,6 @@ void cuda_programmable_bootstrap_64_async(
}
}
/*
 * Type-erased entry point for the 64-bit classical PBS that always takes the
 * generic TBC path.
 *
 * Panics if base_log > 64 or if the buffer behind `mem_ptr` was not created
 * for the TBC variant. With CUDA_ARCH >= 900 the untyped pointers are cast to
 * their 64-bit torus / double2 types and forwarded to
 * cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector_generic<uint64_t>;
 * on older architectures the call panics.
 */
void cuda_programmable_bootstrap_tbc_64_generic_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
// mem_ptr is the opaque scratch buffer; it is interpreted here as a classical
// PBS buffer.
pbs_buffer<uint64_t, CLASSICAL> *buffer =
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
"Cuda error (classical PBS): expected a TBC buffer.");
#if (CUDA_ARCH >= 900)
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector_generic<uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
#else
// Silence unused-parameter warnings on builds without TBC support.
(void)stream;
(void)gpu_index;
(void)lwe_array_out;
(void)lwe_output_indexes;
(void)lut_vector;
(void)lut_vector_indexes;
(void)lwe_array_in;
(void)lwe_input_indexes;
(void)bootstrapping_key;
(void)lwe_dimension;
(void)glwe_dimension;
(void)polynomial_size;
(void)level_count;
(void)num_samples;
(void)num_many_lut;
(void)lut_stride;
PANIC("Cuda error (classical PBS): TBC pbs is not supported.")
#endif
}
/*
 * Type-erased entry point for the 64-bit classical PBS that always takes the
 * specialized "2_2" TBC path.
 *
 * Panics unless the parameters are exactly N=2048, level_count=1,
 * glwe_dimension=1 and base_log=23, and unless the buffer behind `mem_ptr`
 * was created for the TBC variant. With CUDA_ARCH >= 900 it forwards to
 * host_programmable_bootstrap_tbc_2_2_specialized<uint64_t, Degree<2048>>;
 * on older architectures the call panics.
 */
void cuda_programmable_bootstrap_tbc_64_2_2_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
PANIC_IF_FALSE(polynomial_size == 2048 && level_count == 1 &&
glwe_dimension == 1 && base_log == 23,
"Cuda error (classical PBS): specialized TBC 2_2 requires "
"(N=2048, level_count=1, glwe_dimension=1, base_log=23).");
pbs_buffer<uint64_t, CLASSICAL> *buffer =
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
"Cuda error (classical PBS): expected a TBC buffer.");
#if (CUDA_ARCH >= 900)
// Note the host function takes glwe_dimension before lwe_dimension.
host_programmable_bootstrap_tbc_2_2_specialized<uint64_t, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
#else
// Silence unused-parameter warnings; the remaining parameters are already
// consumed by the checks above.
(void)stream;
(void)gpu_index;
(void)lwe_array_out;
(void)lwe_output_indexes;
(void)lut_vector;
(void)lut_vector_indexes;
(void)lwe_array_in;
(void)lwe_input_indexes;
(void)bootstrapping_key;
(void)lwe_dimension;
(void)num_samples;
(void)num_many_lut;
(void)lut_stride;
PANIC("Cuda error (classical PBS): TBC pbs is not supported.")
#endif
}
/*
* This cleanup function frees the data on GPU for the PBS buffer for 32 or 64
* bits inputs.

View File

@@ -3,7 +3,6 @@
#include "pbs/programmable_bootstrap_multibit.h"
#include "programmable_bootstrap_cg_multibit.cuh"
#include "programmable_bootstrap_multibit.cuh"
#include <type_traits>
#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_multibit.cuh"
@@ -221,17 +220,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
}
}
// Forward declaration: the definition appears further down in this file, but
// the *_generic_async entry points below need to call it before that point.
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_generic(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
void cuda_multi_bit_programmable_bootstrap_64_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
@@ -295,159 +283,6 @@ void cuda_multi_bit_programmable_bootstrap_64_async(
}
}
void cuda_multi_bit_programmable_bootstrap_64_generic_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
PANIC_IF_FALSE(base_log <= 64,
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
base_log);
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
"Cuda error (multi-bit PBS): expected a TBC buffer.");
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_generic<
uint64_t>(stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
#else
(void)stream;
(void)gpu_index;
(void)lwe_array_out;
(void)lwe_output_indexes;
(void)lut_vector;
(void)lut_vector_indexes;
(void)lwe_array_in;
(void)lwe_input_indexes;
(void)bootstrapping_key;
(void)lwe_dimension;
(void)glwe_dimension;
(void)polynomial_size;
(void)grouping_factor;
(void)level_count;
(void)num_samples;
(void)num_many_lut;
(void)lut_stride;
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
}
/*
 * Type-erased entry point for the 64-bit multi-bit PBS that always takes the
 * specialized "2_2" TBC path.
 *
 * Panics unless the parameters are exactly N=2048, grouping_factor=4,
 * level_count=1, glwe_dimension=1 and base_log=22, and unless the buffer
 * behind `mem_ptr` is a TBC buffer. With CUDA_ARCH >= 900 it forwards to
 * host_tbc_multi_bit_programmable_bootstrap_2_2_specialized<uint64_t,
 * Degree<2048>>; on older architectures the call panics.
 */
void cuda_multi_bit_programmable_bootstrap_tbc_64_2_2_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
PANIC_IF_FALSE(polynomial_size == 2048 && grouping_factor == 4 &&
level_count == 1 && glwe_dimension == 1 && base_log == 22,
"Cuda error (multi-bit PBS): specialized TBC 2_2 requires "
"(N=2048, grouping_factor=4, level_count=1, glwe_dimension=1, "
"base_log=22).");
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
"Cuda error (multi-bit PBS): expected a TBC buffer.");
#if CUDA_ARCH >= 900
// Note the host function takes glwe_dimension before lwe_dimension.
host_tbc_multi_bit_programmable_bootstrap_2_2_specialized<uint64_t,
Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
#else
// Silence unused-parameter warnings; the remaining parameters are already
// consumed by the checks above.
(void)stream;
(void)gpu_index;
(void)lwe_array_out;
(void)lwe_output_indexes;
(void)lut_vector;
(void)lut_vector_indexes;
(void)lwe_array_in;
(void)lwe_input_indexes;
(void)bootstrapping_key;
(void)lwe_dimension;
(void)num_samples;
(void)num_many_lut;
(void)lut_stride;
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
}
/*
 * Type-erased entry point for the 64-bit multi-bit PBS on the generic TBC
 * path.
 *
 * NOTE(review): the body is identical to
 * cuda_multi_bit_programmable_bootstrap_64_generic_async; confirm whether both
 * names are needed or one should delegate to the other.
 *
 * Panics if base_log > 64 or if the buffer behind `mem_ptr` is not a TBC
 * buffer. With CUDA_ARCH >= 900 it forwards to
 * cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_generic;
 * on older architectures the call panics.
 */
void cuda_multi_bit_programmable_bootstrap_tbc_64_generic_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
PANIC_IF_FALSE(base_log <= 64,
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
base_log);
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
"Cuda error (multi-bit PBS): expected a TBC buffer.");
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_generic<
uint64_t>(stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
#else
// Silence unused-parameter warnings on builds without TBC support.
(void)stream;
(void)gpu_index;
(void)lwe_array_out;
(void)lwe_output_indexes;
(void)lut_vector;
(void)lut_vector_indexes;
(void)lwe_array_in;
(void)lwe_input_indexes;
(void)bootstrapping_key;
(void)lwe_dimension;
(void)glwe_dimension;
(void)polynomial_size;
(void)grouping_factor;
(void)level_count;
(void)num_samples;
(void)num_many_lut;
(void)lut_stride;
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
}
template <typename Torus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
@@ -601,41 +436,6 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64_async(
input_lwe_ciphertext_count, allocate_gpu_memory);
}
/*
 * Scratch entry point for the 64-bit multi-bit PBS on the TBC path.
 *
 * With CUDA_ARCH >= 900 the opaque `buffer` is reinterpreted as a
 * `pbs_buffer<uint64_t, MULTI_BIT> **` and everything is forwarded to
 * scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>, whose result
 * is returned. On older architectures the call panics.
 */
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_generic_64_async(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
#if CUDA_ARCH >= 900
  auto **typed_buffer =
      reinterpret_cast<pbs_buffer<uint64_t, MULTI_BIT> **>(buffer);
  return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
      stream, gpu_index, typed_buffer, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
#else
  // Silence unused-parameter warnings on builds without TBC support.
  (void)stream;
  (void)gpu_index;
  (void)buffer;
  (void)glwe_dimension;
  (void)polynomial_size;
  (void)level_count;
  (void)input_lwe_ciphertext_count;
  (void)allocate_gpu_memory;
  PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
}
/*
 * Scratch entry point for the specialized "2_2" multi-bit TBC PBS.
 *
 * Panics unless the parameters are exactly N=2048, level_count=1 and
 * glwe_dimension=1, then delegates to the generic TBC scratch entry point
 * with the arguments unchanged and returns its result.
 */
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_2_2_64_async(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
PANIC_IF_FALSE(
polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1,
"Cuda error (multi-bit PBS): specialized TBC 2_2 scratch requires "
"(N=2048, level_count=1, glwe_dimension=1).");
return scratch_cuda_multi_bit_programmable_bootstrap_tbc_generic_64_async(
stream, gpu_index, buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
}
void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
uint32_t gpu_index,
int8_t **buffer) {
@@ -843,9 +643,6 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
static_assert(std::is_same_v<Torus, uint64_t>,
"Cuda error (multi-bit PBS): TBC path currently supports only "
"uint64_t torus.");
if (base_log > 32)
PANIC("Cuda error (multi-bit PBS): base log should be <= 32")
@@ -928,108 +725,6 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
}
}
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_generic(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
static_assert(std::is_same_v<Torus, uint64_t>,
"Cuda error (multi-bit PBS): TBC path currently supports only "
"uint64_t torus.");
if (base_log > 32)
PANIC("Cuda error (multi-bit PBS): base log should be <= 32")
switch (polynomial_size) {
case 256:
host_tbc_multi_bit_programmable_bootstrap_generic<uint64_t,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 2048: {
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
if (4 * num_sms < num_samples * level_count * (glwe_dimension + 1))
host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, num_samples, num_many_lut, lut_stride);
else
host_tbc_multi_bit_programmable_bootstrap_generic<Torus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, num_samples, num_many_lut, lut_stride);
break;
}
case 4096:
host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
template uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,

View File

@@ -658,20 +658,13 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
return size_tracker;
}
// Selects which keybundle kernel execute_compute_keybundle_with_mode launches.
enum class MultiBitKeybundleLaunchMode {
// Use the specialized 2_2 kernel whenever the parameters allow it.
AUTO,
// Never use the specialized 2_2 kernel.
GENERIC,
// Require the specialized 2_2 kernel; panics if the parameters or the
// available shared memory do not allow it.
SPECIALIZED_2_2,
};
template <typename Torus, class params>
__host__ void execute_compute_keybundle_with_mode(
__host__ void execute_compute_keybundle(
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset,
MultiBitKeybundleLaunchMode launch_mode) {
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
cuda_set_device(gpu_index);
PANIC_IF_FALSE(sizeof(Torus) == 8,
"Error: PBS keybundle only supports 64-bit "
@@ -698,9 +691,6 @@ __host__ void execute_compute_keybundle_with_mode(
dim3 thds(polynomial_size / params::opt, 1, 1);
if (max_shared_memory < full_sm_keybundle) {
PANIC_IF_FALSE(launch_mode != MultiBitKeybundleLaunchMode::SPECIALIZED_2_2,
"Cuda error (multi-bit PBS): specialized keybundle 2_2 "
"requires FULLSM.");
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>
<<<grid_keybundle, thds, 0, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
@@ -713,21 +703,8 @@ __host__ void execute_compute_keybundle_with_mode(
num_samples, glwe_dimension, polynomial_size, level_count,
cuda_get_max_shared_memory(gpu_index));
bool can_use_specialized = supports_tbc && polynomial_size == 2048 &&
grouping_factor == 4 && level_count == 1 &&
glwe_dimension == 1;
if (launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2) {
PANIC_IF_FALSE(
can_use_specialized,
"Cuda error (multi-bit PBS): specialized keybundle 2_2 requires "
"(N=2048, grouping_factor=4, level_count=1, glwe_dimension=1).");
}
bool use_specialized =
launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2 ||
(launch_mode == MultiBitKeybundleLaunchMode::AUTO &&
can_use_specialized);
if (use_specialized) {
if (supports_tbc && polynomial_size == 2048 && grouping_factor == 4 &&
level_count == 1 && glwe_dimension == 1) {
dim3 thds_new_keybundle(512, 1, 1);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
@@ -755,48 +732,6 @@ __host__ void execute_compute_keybundle_with_mode(
check_cuda_error(cudaGetLastError());
}
// Computes a keybundle, letting the implementation pick the kernel: forwards
// all arguments to execute_compute_keybundle_with_mode with
// MultiBitKeybundleLaunchMode::AUTO.
template <typename Torus, class params>
__host__ void execute_compute_keybundle(
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
execute_compute_keybundle_with_mode<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset,
MultiBitKeybundleLaunchMode::AUTO);
}
// Computes a keybundle with the generic kernel only: forwards all arguments to
// execute_compute_keybundle_with_mode with
// MultiBitKeybundleLaunchMode::GENERIC.
template <typename Torus, class params>
__host__ void execute_compute_keybundle_generic(
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
execute_compute_keybundle_with_mode<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset,
MultiBitKeybundleLaunchMode::GENERIC);
}
// Computes a keybundle with the specialized 2_2 kernel: forwards all arguments
// to execute_compute_keybundle_with_mode with
// MultiBitKeybundleLaunchMode::SPECIALIZED_2_2, which panics when that kernel
// cannot be used.
template <typename Torus, class params>
__host__ void execute_compute_keybundle_2_2_specialized(
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
execute_compute_keybundle_with_mode<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset,
MultiBitKeybundleLaunchMode::SPECIALIZED_2_2);
}
template <typename Torus, class params, bool is_first_iter>
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
@@ -916,11 +851,10 @@ __host__ void host_multi_bit_programmable_bootstrap(
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle_with_mode<Torus, params>(
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset,
MultiBitKeybundleLaunchMode::GENERIC);
grouping_factor, level_count, lwe_offset);
// Accumulate
uint32_t chunk_size =
std::min((uint32_t)lwe_chunk_size,

View File

@@ -22,12 +22,6 @@
using namespace cooperative_groups;
namespace cg = cooperative_groups;
// Selects which kernel host_programmable_bootstrap_tbc_with_mode launches for
// the classical TBC PBS. The specialized variant is only eligible for
// N=2048, level_count=1, glwe_dimension=1, base_log=23 when the full
// shared-memory configuration is available.
enum class ClassicalTbcLaunchMode {
AUTO, // Heuristic-based selection based on parameters
GENERIC, // Force-fallback to the generic implementation
SPECIALIZED_2_2, // Force-select the 2.2 specialized variant
};
/*
* Kernel that computes the classical PBS using cooperative groups
*
@@ -461,7 +455,7 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
* Host wrapper
*/
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc_with_mode(
__host__ void host_programmable_bootstrap_tbc(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
@@ -469,8 +463,7 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_many_lut, uint32_t lut_stride,
ClassicalTbcLaunchMode launch_mode) {
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);
PANIC_IF_FALSE(sizeof(Torus) == 8,
@@ -521,9 +514,6 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
config.stream = stream;
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
PANIC_IF_FALSE(
launch_mode != ClassicalTbcLaunchMode::SPECIALIZED_2_2,
"Cuda error (classical PBS): specialized TBC 2_2 requires FULLSM.");
config.dynamicSmemBytes = minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
@@ -533,9 +523,6 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
supports_dsm, num_many_lut, lut_stride, noise_reduction_type));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
PANIC_IF_FALSE(
launch_mode != ClassicalTbcLaunchMode::SPECIALIZED_2_2,
"Cuda error (classical PBS): specialized TBC 2_2 requires FULLSM.");
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
@@ -546,19 +533,8 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
partial_dm, supports_dsm, num_many_lut, lut_stride,
noise_reduction_type));
} else {
bool can_use_specialized = polynomial_size == 2048 && level_count == 1 &&
glwe_dimension == 1 && base_log == 23;
if (launch_mode == ClassicalTbcLaunchMode::SPECIALIZED_2_2) {
PANIC_IF_FALSE(can_use_specialized,
"Cuda error (classical PBS): specialized TBC 2_2 requires "
"(N=2048, level_count=1, glwe_dimension=1, base_log=23).");
}
bool use_specialized =
launch_mode == ClassicalTbcLaunchMode::SPECIALIZED_2_2 ||
(launch_mode == ClassicalTbcLaunchMode::AUTO && can_use_specialized);
if (use_specialized) {
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1 &&
base_log == 23) {
uint64_t full_sm_2_2 =
get_buffer_size_full_sm_programmable_bootstrap_tbc_2_2_params<Torus>(
polynomial_size);
@@ -594,60 +570,6 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
}
}
// Classical TBC PBS host wrapper with automatic kernel selection: forwards all
// arguments to host_programmable_bootstrap_tbc_with_mode with
// ClassicalTbcLaunchMode::AUTO.
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_many_lut, uint32_t lut_stride) {
host_programmable_bootstrap_tbc_with_mode<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, input_lwe_ciphertext_count, num_many_lut, lut_stride,
ClassicalTbcLaunchMode::AUTO);
}
// Classical TBC PBS host wrapper restricted to the generic kernels: forwards
// all arguments to host_programmable_bootstrap_tbc_with_mode with
// ClassicalTbcLaunchMode::GENERIC.
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc_generic(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_many_lut, uint32_t lut_stride) {
host_programmable_bootstrap_tbc_with_mode<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, input_lwe_ciphertext_count, num_many_lut, lut_stride,
ClassicalTbcLaunchMode::GENERIC);
}
// Classical TBC PBS host wrapper that demands the specialized 2_2 kernel:
// forwards all arguments to host_programmable_bootstrap_tbc_with_mode with
// ClassicalTbcLaunchMode::SPECIALIZED_2_2, which panics when that kernel
// cannot be used.
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc_2_2_specialized(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_many_lut, uint32_t lut_stride) {
host_programmable_bootstrap_tbc_with_mode<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, input_lwe_ciphertext_count, num_many_lut, lut_stride,
ClassicalTbcLaunchMode::SPECIALIZED_2_2);
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(

View File

@@ -18,12 +18,6 @@
#include "types/complex/operations.cuh"
#include <vector>
// Selects which kernel execute_tbc_external_product_loop launches for the
// multi-bit TBC PBS. The specialized variant is only eligible for N=2048,
// grouping_factor=4, level_count=1, glwe_dimension=1, base_log=22 when the
// full shared-memory configuration is available.
enum class MultiBitTbcLaunchMode {
AUTO, // Heuristic-based selection based on parameters
GENERIC, // Force-fallback to the generic implementation
SPECIALIZED_2_2, // Force-select the 2.2 specialized variant
};
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_tbc_accumulate(
@@ -536,7 +530,7 @@ __host__ void execute_tbc_external_product_loop(
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
uint32_t lut_stride, MultiBitTbcLaunchMode launch_mode) {
uint32_t lut_stride) {
PANIC_IF_FALSE(
sizeof(Torus) == 8,
@@ -596,9 +590,6 @@ __host__ void execute_tbc_external_product_loop(
config.stream = stream;
if (max_shared_memory < partial_dm + minimum_dm) {
PANIC_IF_FALSE(
launch_mode != MultiBitTbcLaunchMode::SPECIALIZED_2_2,
"Cuda error (multi-bit PBS): specialized TBC 2_2 requires FULLSM.");
config.dynamicSmemBytes = minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
@@ -611,9 +602,6 @@ __host__ void execute_tbc_external_product_loop(
keybundle_size_per_input, d_mem, full_dm, supports_dsm, num_many_lut,
lut_stride));
} else if (max_shared_memory < full_dm + minimum_dm) {
PANIC_IF_FALSE(
launch_mode != MultiBitTbcLaunchMode::SPECIALIZED_2_2,
"Cuda error (multi-bit PBS): specialized TBC 2_2 requires FULLSM.");
config.dynamicSmemBytes = partial_dm + minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
@@ -627,21 +615,8 @@ __host__ void execute_tbc_external_product_loop(
lut_stride));
} else {
config.dynamicSmemBytes = full_dm + minimum_dm;
bool can_use_specialized = polynomial_size == 2048 &&
grouping_factor == 4 && level_count == 1 &&
glwe_dimension == 1 && base_log == 22;
if (launch_mode == MultiBitTbcLaunchMode::SPECIALIZED_2_2) {
PANIC_IF_FALSE(
can_use_specialized,
"Cuda error (multi-bit PBS): specialized TBC 2_2 requires "
"(N=2048, grouping_factor=4, level_count=1, glwe_dimension=1, "
"base_log=22).");
}
bool use_specialized =
launch_mode == MultiBitTbcLaunchMode::SPECIALIZED_2_2 ||
(launch_mode == MultiBitTbcLaunchMode::AUTO && can_use_specialized);
if (use_specialized) {
if (polynomial_size == 2048 && grouping_factor == 4 && level_count == 1 &&
glwe_dimension == 1 && base_log == 22) {
config.dynamicSmemBytes = full_dm + 2 * minimum_dm;
check_cuda_error(cudaFuncSetAttribute(
@@ -690,8 +665,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride,
MultiBitTbcLaunchMode launch_mode) {
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);
auto lwe_chunk_size = buffer->lwe_chunk_size;
@@ -699,27 +673,10 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
switch (launch_mode) {
case MultiBitTbcLaunchMode::GENERIC:
execute_compute_keybundle_generic<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
break;
case MultiBitTbcLaunchMode::SPECIALIZED_2_2:
execute_compute_keybundle_2_2_specialized<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
break;
case MultiBitTbcLaunchMode::AUTO:
default:
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
break;
}
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
// Accumulate
execute_tbc_external_product_loop<Torus, params>(
@@ -727,64 +684,10 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride, launch_mode);
lut_stride);
}
}
// Convenience overload without an explicit launch mode.
// Forwards every argument unchanged to the launch-mode-aware overload of
// host_tbc_multi_bit_programmable_bootstrap and passes
// MultiBitTbcLaunchMode::AUTO as the mode.
template <typename Torus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
host_tbc_multi_bit_programmable_bootstrap<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride,
MultiBitTbcLaunchMode::AUTO);
}
// Entry point that forces the generic multi-bit TBC path.
// Forwards every argument unchanged to the launch-mode-aware overload of
// host_tbc_multi_bit_programmable_bootstrap and passes
// MultiBitTbcLaunchMode::GENERIC as the mode.
template <typename Torus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap_generic(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
host_tbc_multi_bit_programmable_bootstrap<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride,
MultiBitTbcLaunchMode::GENERIC);
}
// Entry point that forces the 2_2 specialized multi-bit TBC path.
// Forwards every argument unchanged to the launch-mode-aware overload of
// host_tbc_multi_bit_programmable_bootstrap and passes
// MultiBitTbcLaunchMode::SPECIALIZED_2_2 as the mode.
// NOTE(review): with this mode the external-product loop panics unless the
// parameters are N=2048, grouping_factor=4, level_count=1, glwe_dimension=1,
// base_log=22 and the full shared-memory configuration is available; callers
// are expected to check this beforehand -- confirm against the call sites.
template <typename Torus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap_2_2_specialized(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
host_tbc_multi_bit_programmable_bootstrap<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride,
MultiBitTbcLaunchMode::SPECIALIZED_2_2);
}
template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory) {

View File

@@ -340,6 +340,28 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &buffer);
}
// Benchmarks the amortized classical PBS entry point using the buffers and
// parameters held by the ClassicalBootstrap_u64 fixture.
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS)
(benchmark::State &st) {
// Set up the scratch buffer once, before the measured loop. The final `true`
// is the allocate_gpu_memory flag of the scratch function.
scratch_cuda_programmable_bootstrap_amortized_64_async(
stream, gpu_index, &buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, true);
for (auto _ : st) {
// Execute PBS
cuda_programmable_bootstrap_amortized_64_async(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in_array,
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count);
// Wait for the stream inside the loop so every iteration covers the
// submitted PBS work and not only its submission.
cuda_synchronize_stream(stream, gpu_index);
}
// Release the scratch buffer set up before the loop.
cleanup_cuda_programmable_bootstrap_amortized_64(stream, gpu_index, &buffer);
}
static void
MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
@@ -424,3 +446,8 @@ BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, CgPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
// Registers the AmortizedPBS benchmark with the parameter generator and the
// argument names also used by the CgPBS registration just above.
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, AmortizedPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});

View File

@@ -4,7 +4,6 @@
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <pbs/pbs_utilities.h>
#include <setup_and_teardown.h>
#include <utils.h>
@@ -57,63 +56,6 @@ protected:
uint64_t *d_lwe_output_indexes;
uint64_t *lwe_ct_out_array;
// Shared driver for the classical PBS tests.
// For every repetition and sample it calls `run_pbs` on the matching input
// batch and bootstrapping key, copies the output ciphertexts back to the host,
// decrypts each one and checks the decoded message against the expected
// plaintext.
//   run_pbs:    callback that launches the PBS variant under test; it receives
//               the input ciphertext pointer, the bootstrapping key pointer for
//               the current repetition, and `pbs_buffer`.
//   pbs_buffer: scratch buffer handed through to `run_pbs` unchanged.
void run_and_check_pbs(
const std::function<void(uint64_t *d_lwe_ct_in, double *d_fourier_bsk,
int8_t *pbs_buffer)> &run_pbs,
int8_t *pbs_buffer) {
// Number of elements of one bootstrapping key; used as the per-repetition
// stride into d_fourier_bsk_array.
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
for (int r = 0; r < repetitions; r++) {
// Each repetition has its own bootstrapping key and output secret key.
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
// Start of the batch of number_of_inputs ciphertexts, each of
// (lwe_dimension + 1) words, for this (repetition, sample) pair.
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
s * number_of_inputs) *
(lwe_dimension + 1));
run_pbs(d_lwe_ct_in, d_fourier_bsk, pbs_buffer);
// Copy the results back and wait for the stream before reading them.
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
// The raw decryption must differ from the exact plaintext
// (presumably because the ciphertext is expected to carry noise --
// TODO confirm).
ASSERT_NE(decrypted, plaintext);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
ASSERT_EQ(decoded, plaintext / delta);
}
}
}
}
// Returns the result of the CG classical PBS support query for the fixture's
// parameters and the maximum shared memory reported for gpu_index.
bool supports_classical_cg() const {
return has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, pbs_level, number_of_inputs,
cuda_get_max_shared_memory(gpu_index));
}
// Returns the result of the TBC classical PBS support query for the fixture's
// parameters and the maximum shared memory reported for gpu_index.
// Note the argument order differs from the CG query: number_of_inputs is
// passed first here.
bool supports_classical_tbc() const {
return has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
number_of_inputs, glwe_dimension, polynomial_size, pbs_level,
cuda_get_max_shared_memory(gpu_index));
}
public:
// Test arithmetic functions
void SetUp() {
@@ -163,121 +105,132 @@ public:
}
};
TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64,
classical_auto_dispatch) {
pbs_buffer<uint64_t, CLASSICAL> *typed_buffer = nullptr;
scratch_cuda_programmable_bootstrap<uint64_t>(
stream, gpu_index, &typed_buffer, lwe_dimension, glwe_dimension,
polynomial_size, pbs_level, number_of_inputs, true,
PBS_MS_REDUCTION_T::NO_REDUCTION);
int8_t *pbs_buffer = reinterpret_cast<int8_t *>(typed_buffer);
TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_programmable_bootstrap_amortized_64_async(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
number_of_inputs, true);
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
run_and_check_pbs(
[&](uint64_t *d_lwe_ct_in, double *d_fourier_bsk, int8_t *buffer) {
cuda_programmable_bootstrap_64_async(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
},
pbs_buffer);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
// Here execute the PBS
for (int r = 0; r < repetitions; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_programmable_bootstrap_amortized_64_async(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(
lwe_ct_out_array, d_lwe_ct_out_array,
safe_mul_sizeof<uint64_t>(
safe_mul((size_t)glwe_dimension, (size_t)polynomial_size) + 1,
(size_t)number_of_inputs),
stream, gpu_index);
cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
EXPECT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
// plaintext
// - decrypted;
// error_sample_vec.push(err);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta)
<< "Repetition: " << r << ", sample: " << s;
}
}
}
cleanup_cuda_programmable_bootstrap_amortized_64(stream, gpu_index,
&pbs_buffer);
}
TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, classical_cg) {
if (!supports_classical_cg()) {
GTEST_SKIP() << "CG classical PBS is not supported on this architecture.";
}
pbs_buffer<uint64_t, CLASSICAL> *typed_buffer = nullptr;
scratch_cuda_programmable_bootstrap_cg<uint64_t>(
stream, gpu_index, &typed_buffer, lwe_dimension, glwe_dimension,
polynomial_size, pbs_level, number_of_inputs, true,
PBS_MS_REDUCTION_T::NO_REDUCTION);
int8_t *pbs_buffer = reinterpret_cast<int8_t *>(typed_buffer);
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
run_and_check_pbs(
[&](uint64_t *d_lwe_ct_in, double *d_fourier_bsk, int8_t *buffer) {
auto *typed =
reinterpret_cast<::pbs_buffer<uint64_t, CLASSICAL> *>(buffer);
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in,
d_lwe_input_indexes,
reinterpret_cast<const double2 *>(d_fourier_bsk), typed,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
},
pbs_buffer);
cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer);
}
TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, classical_tbc) {
if (!supports_classical_tbc()) {
GTEST_SKIP() << "TBC classical PBS is not supported on this architecture.";
}
int8_t *pbs_buffer = nullptr;
scratch_cuda_programmable_bootstrap_tbc_generic_64_async(
TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_programmable_bootstrap_64_async(
stream, gpu_index, &pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, pbs_level, number_of_inputs, true,
PBS_MS_REDUCTION_T::NO_REDUCTION);
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
run_and_check_pbs(
[&](uint64_t *d_lwe_ct_in, double *d_fourier_bsk, int8_t *buffer) {
cuda_programmable_bootstrap_tbc_64_generic_async(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
},
pbs_buffer);
// Here execute the PBS
for (int r = 0; r < repetitions; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_programmable_bootstrap_64_async(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
// Copy result back
cuda_memcpy_async_to_cpu(
lwe_ct_out_array, d_lwe_ct_out_array,
safe_mul_sizeof<uint64_t>(
safe_mul((size_t)glwe_dimension, (size_t)polynomial_size) + 1,
(size_t)number_of_inputs),
stream, gpu_index);
cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer);
}
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
ASSERT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
// plaintext
// - decrypted;
// error_sample_vec.push(err);
TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, classical_tbc_2_2) {
if (!supports_classical_tbc()) {
GTEST_SKIP() << "TBC classical PBS is not supported on this architecture.";
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
ASSERT_EQ(decoded, plaintext / delta);
}
}
}
if (!(polynomial_size == 2048 && pbs_level == 1 && glwe_dimension == 1 &&
pbs_base_log == 23)) {
GTEST_SKIP()
<< "TBC specialized 2_2 requires N=2048, glwe=1, level=1, base_log=23.";
}
int8_t *pbs_buffer = nullptr;
scratch_cuda_programmable_bootstrap_tbc_2_2_64_async(
stream, gpu_index, &pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, pbs_level, number_of_inputs, true,
PBS_MS_REDUCTION_T::NO_REDUCTION);
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
run_and_check_pbs(
[&](uint64_t *d_lwe_ct_in, double *d_fourier_bsk, int8_t *buffer) {
cuda_programmable_bootstrap_tbc_64_2_2_async(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
},
pbs_buffer);
cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer);
}
@@ -288,19 +241,16 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, classical_tbc_2_2) {
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// message_modulus, carry_modulus, number_of_inputs, repetitions,
// samples
// V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128
// V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
(ClassicalProgrammableBootstrapTestParams){
879, 4, 512, new_t_uniform(46), new_t_uniform(17), 23, 1, 2, 2, 10,
1, 1},
// V1_6_PARAM_GPU_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128
918, 1, 2048, new_t_uniform(45), new_t_uniform(17), 23, 1, 4, 4,
100, 1, 1},
// V1_1_PARAM_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128
// This test is here only to be sure we don't break support for
// 8192-degree polynomials
(ClassicalProgrammableBootstrapTestParams){
759, 1, 2048, new_t_uniform(50), new_t_uniform(17), 23, 1, 2, 2, 10,
1, 1},
// V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
(ClassicalProgrammableBootstrapTestParams){
918, 1, 2048, new_t_uniform(45), new_t_uniform(17), 23, 1, 4, 4, 10,
1, 1});
1077, 1, 8192, new_t_uniform(41), new_t_uniform(3), 15, 2, 4, 4,
100, 1, 1});
std::string printParamName(
::testing::TestParamInfo<ClassicalProgrammableBootstrapTestParams> p) {
ClassicalProgrammableBootstrapTestParams params = p.param;

View File

@@ -6,8 +6,6 @@
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <pbs/pbs_multibit_utilities.h>
#include <pbs/programmable_bootstrap_multibit.h>
#include <setup_and_teardown.h>
#include <utils.h>
@@ -56,71 +54,11 @@ protected:
uint64_t *lwe_ct_out_array;
uint64_t *d_lwe_input_indexes;
uint64_t *d_lwe_output_indexes;
int8_t *pbs_buffer;
int repetitions;
int samples;
// Shared driver for the multi-bit PBS tests.
// For every repetition and sample it calls `run_pbs` on the matching input
// batch and bootstrapping key, copies the output ciphertexts back to the host,
// decrypts each one and checks the decoded message against the expected
// plaintext. Failures are reported with EXPECT_*, so checking continues after
// a mismatch.
//   run_pbs:    callback that launches the PBS variant under test; it receives
//               the input ciphertext pointer, the bootstrapping key pointer for
//               the current repetition, and `pbs_buffer`.
//   pbs_buffer: scratch buffer handed through to `run_pbs` unchanged.
void run_and_check_pbs(
const std::function<void(uint64_t *d_lwe_ct_in, uint64_t *d_bsk,
int8_t *pbs_buffer)> &run_pbs,
int8_t *pbs_buffer) {
// Number of elements of one multi-bit bootstrapping key; used as the
// per-repetition stride into d_bsk_array.
int bsk_size = (lwe_dimension / grouping_factor) * pbs_level *
(glwe_dimension + 1) * (glwe_dimension + 1) *
polynomial_size * (1 << grouping_factor);
for (int r = 0; r < repetitions; r++) {
// Each repetition has its own bootstrapping key and output secret key.
uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
// Start of the batch of number_of_inputs ciphertexts, each of
// (lwe_dimension + 1) words, for this (repetition, sample) pair.
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
s * number_of_inputs) *
(lwe_dimension + 1));
run_pbs(d_lwe_ct_in, d_bsk, pbs_buffer);
// Copy the results back and wait for the stream before reading them.
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
// The raw decryption must differ from the exact plaintext
// (presumably because the ciphertext is expected to carry noise --
// TODO confirm).
EXPECT_NE(decrypted, plaintext)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
}
}
}
}
// Returns the result of the CG multi-bit PBS support query for the fixture's
// parameters and the maximum shared memory reported for gpu_index.
bool supports_multibit_cg() const {
return has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
glwe_dimension, polynomial_size, pbs_level, number_of_inputs,
cuda_get_max_shared_memory(gpu_index));
}
// Returns the result of the TBC multi-bit PBS support query for the fixture's
// parameters and the maximum shared memory reported for gpu_index.
// Note the argument order differs from the CG query: number_of_inputs is
// passed first here.
bool supports_multibit_tbc() const {
return has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
number_of_inputs, glwe_dimension, polynomial_size, pbs_level,
cuda_get_max_shared_memory(gpu_index));
}
public:
void SetUp() {
stream = cuda_create_stream(gpu_index);
@@ -155,14 +93,20 @@ public:
pbs_base_log, pbs_level, message_modulus, carry_modulus,
&payload_modulus, &delta, number_of_inputs, repetitions, samples);
lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t));
scratch_cuda_multi_bit_programmable_bootstrap_64_async(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, true);
lwe_ct_out_array = (uint64_t *)malloc(safe_mul_sizeof<uint64_t>(
safe_mul((size_t)glwe_dimension, (size_t)polynomial_size) + 1,
(size_t)number_of_inputs));
}
void TearDown() {
free(lwe_ct_out_array);
cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
&pbs_buffer);
programmable_bootstrap_multibit_teardown(
stream, gpu_index, lwe_sk_in_array, lwe_sk_out_array, d_bsk_array,
plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
@@ -170,135 +114,104 @@ public:
}
};
TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, multi_bit_default) {
int8_t *pbs_buffer = nullptr;
scratch_cuda_multi_bit_programmable_bootstrap_64_async(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, true);
TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
multi_bit_programmable_bootstrap) {
int bsk_size = (lwe_dimension / grouping_factor) * pbs_level *
(glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size *
(1 << grouping_factor);
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
run_and_check_pbs(
[&](uint64_t *d_lwe_ct_in, uint64_t *d_bsk, int8_t *buffer) {
auto *typed =
reinterpret_cast<::pbs_buffer<uint64_t, MULTI_BIT> *>(buffer);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in,
d_lwe_input_indexes, d_bsk, typed, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, pbs_base_log, pbs_level,
number_of_inputs, num_many_lut, lut_stride);
},
pbs_buffer);
for (int r = 0; r < repetitions; r++) {
uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_multi_bit_programmable_bootstrap_64_async(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
&pbs_buffer);
}
TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, multi_bit_cg) {
if (!supports_multibit_cg()) {
GTEST_SKIP() << "CG multibit PBS is not supported on this architecture.";
}
pbs_buffer<uint64_t, MULTI_BIT> *typed_buffer = nullptr;
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, &typed_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, true);
int8_t *pbs_buffer = reinterpret_cast<int8_t *>(typed_buffer);
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
run_and_check_pbs(
[&](uint64_t *d_lwe_ct_in, uint64_t *d_bsk, int8_t *buffer) {
auto *typed =
reinterpret_cast<::pbs_buffer<uint64_t, MULTI_BIT> *>(buffer);
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<
uint64_t>(stream, gpu_index, d_lwe_ct_out_array,
d_lwe_output_indexes, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in, d_lwe_input_indexes,
d_bsk, typed, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, pbs_base_log, pbs_level,
number_of_inputs, num_many_lut, lut_stride);
},
pbs_buffer);
cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
&pbs_buffer);
}
TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, multi_bit_tbc) {
if (!supports_multibit_tbc()) {
GTEST_SKIP() << "TBC multibit PBS is not supported on this architecture.";
}
int8_t *pbs_buffer = nullptr;
scratch_cuda_multi_bit_programmable_bootstrap_tbc_generic_64_async(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, true);
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
run_and_check_pbs(
[&](uint64_t *d_lwe_ct_in, uint64_t *d_bsk, int8_t *buffer) {
cuda_multi_bit_programmable_bootstrap_tbc_64_generic_async(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_bsk, buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
},
pbs_buffer);
cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
&pbs_buffer);
}
TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, multi_bit_tbc_2_2) {
if (!supports_multibit_tbc()) {
GTEST_SKIP() << "TBC multibit PBS is not supported on this architecture.";
}
if (!(polynomial_size == 2048 && grouping_factor == 4 && pbs_level == 1 &&
glwe_dimension == 1 && pbs_base_log == 22)) {
GTEST_SKIP() << "TBC specialized 2_2 requires N=2048, grouping_factor=4, "
"glwe=1, level=1, base_log=22.";
}
int8_t *pbs_buffer = nullptr;
scratch_cuda_multi_bit_programmable_bootstrap_tbc_2_2_64_async(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, true);
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
run_and_check_pbs(
[&](uint64_t *d_lwe_ct_in, uint64_t *d_bsk, int8_t *buffer) {
cuda_multi_bit_programmable_bootstrap_tbc_64_2_2_async(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_bsk, buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
},
pbs_buffer);
cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
&pbs_buffer);
// Copy result to the host memory
cuda_memcpy_async_to_cpu(
lwe_ct_out_array, d_lwe_ct_out_array,
safe_mul_sizeof<uint64_t>(
safe_mul((size_t)glwe_dimension, (size_t)polynomial_size) + 1,
(size_t)number_of_inputs),
stream, gpu_index);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
EXPECT_NE(decrypted, plaintext)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
}
}
}
}
/**
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
DynamicDistribution lwe_noise_distribution;
DynamicDistribution glwe_noise_distribution;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int grouping_factor;
int repetitions;
int samples;
*/
// Defines for which parameters set the PBS will be tested.
// It executes each src for all pairs on phis X qs (Cartesian product)
::testing::internal::ParamGenerator<MultiBitProgrammableBootstrapTestParams>
multipbs_params_u64 = ::testing::Values(
// V1_4_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128
// V1_1_PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
(MultiBitProgrammableBootstrapTestParams){
760, 1, 2048, new_t_uniform(49), new_t_uniform(17), 22, 1, 2, 2, 10,
4, 1, 1},
918, 1, 4096, new_t_uniform(45), new_t_uniform(3), 21, 1, 4, 4, 100,
2, 1, 1},
// V1_1_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
(MultiBitProgrammableBootstrapTestParams){
879, 1, 2048, new_t_uniform(46), new_t_uniform(17), 14, 2, 4, 4,
100, 3, 1, 1},
// V1_1_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
(MultiBitProgrammableBootstrapTestParams){
920, 1, 2048, new_t_uniform(45), new_t_uniform(17), 22, 1, 4, 4, 10,
4, 1, 1});
920, 1, 2048, new_t_uniform(45), new_t_uniform(17), 22, 1, 4, 4,
100, 4, 1, 1},
// V1_1_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128
// This test is here only to be sure we don't break support for
// 8192-degree polynomials
(MultiBitProgrammableBootstrapTestParams){
1040, 1, 8192, new_t_uniform(42), new_t_uniform(3), 14, 2, 4, 4,
100, 4, 1, 1});
std::string printParamName(
::testing::TestParamInfo<MultiBitProgrammableBootstrapTestParams> p) {

View File

@@ -3162,6 +3162,64 @@ unsafe extern "C" {
polynomial_size: u32,
);
}
unsafe extern "C" {
/// Foreign (C ABI) declaration of
/// `scratch_cuda_programmable_bootstrap_amortized_64_async`.
///
/// Takes an opaque stream handle, a GPU index, a pointer-to-pointer
/// `pbs_buffer`, the GLWE dimension, the polynomial size, the number of
/// input LWE ciphertexts and an `allocate_gpu_memory` flag, and returns a
/// `u64`.
///
/// NOTE(review): `pbs_buffer` looks like an out-parameter that receives the
/// scratch buffer later handed to
/// `cuda_programmable_bootstrap_amortized_64_async` and released by
/// `cleanup_cuda_programmable_bootstrap_amortized_64`; the returned `u64` is
/// presumably a size in bytes. Confirm both against the C header.
///
/// # Safety
/// This is a raw FFI call: the caller must pass pointers that the foreign
/// implementation accepts. The exact validity requirements are defined by
/// the C/CUDA side, not visible here.
pub fn scratch_cuda_programmable_bootstrap_amortized_64_async(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
/// Foreign (C ABI) declaration of
/// `cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32_async`.
///
/// All data arguments are type-erased (`c_void`) pointers: one mutable
/// output array (`lwe_array_out`) and read-only pointers for the output
/// indexes, LUT vector, LUT indexes, input array, input indexes and
/// bootstrapping key. `pbs_buffer` is passed as a plain `*mut i8`, and the
/// remaining arguments are `u32` parameters (dimensions, decomposition
/// base log / level count, sample count). Nothing is returned.
///
/// NOTE(review): the `_32` in the name presumably denotes 32-bit torus
/// elements, and the pointers presumably refer to device memory on
/// `gpu_index`. Neither is visible from this declaration; confirm against
/// the C header.
///
/// # Safety
/// This is a raw FFI call: the caller must pass pointers and sizes that the
/// foreign implementation accepts. The exact requirements are defined by
/// the C/CUDA side, not visible here.
pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32_async(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
pbs_buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
);
}
unsafe extern "C" {
/// Foreign (C ABI) declaration of
/// `cuda_programmable_bootstrap_amortized_64_async`.
///
/// All data arguments are type-erased (`c_void`) pointers: one mutable
/// output array (`lwe_array_out`) and read-only pointers for the output
/// indexes, LUT vector, LUT indexes, input array, input indexes and
/// bootstrapping key. `pbs_buffer` is passed as a plain `*mut i8`, and the
/// remaining arguments are `u32` parameters (dimensions, decomposition
/// base log / level count, sample count). Nothing is returned.
///
/// NOTE(review): `pbs_buffer` is presumably the buffer produced by
/// `scratch_cuda_programmable_bootstrap_amortized_64_async`, and the `_64`
/// presumably denotes 64-bit torus elements. Confirm against the C header.
///
/// # Safety
/// This is a raw FFI call: the caller must pass pointers and sizes that the
/// foreign implementation accepts. The exact requirements are defined by
/// the C/CUDA side, not visible here.
pub fn cuda_programmable_bootstrap_amortized_64_async(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
pbs_buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
);
}
unsafe extern "C" {
/// Foreign (C ABI) declaration of
/// `cleanup_cuda_programmable_bootstrap_amortized_64`.
///
/// Takes an opaque stream handle, a GPU index and a pointer-to-pointer
/// `pbs_buffer`; returns nothing.
///
/// NOTE(review): presumably releases the buffer created by
/// `scratch_cuda_programmable_bootstrap_amortized_64_async`. Unlike its
/// siblings the name carries no `_async` suffix, which suggests (but does
/// not prove) that it synchronizes. Confirm against the C header.
///
/// # Safety
/// This is a raw FFI call: the caller must pass a `pbs_buffer` pointer that
/// the foreign implementation accepts. The exact requirements are defined
/// by the C/CUDA side, not visible here.
pub fn cleanup_cuda_programmable_bootstrap_amortized_64(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_64_async(
stream: *mut ffi::c_void,

View File

@@ -4,8 +4,8 @@ version = "0.4.0"
edition = "2021"
license = "BSD-3-Clause-Clear"
description = "HPU implementation on FPGA of TFHE-rs primitives."
homepage = "https://www.zama.org/"
documentation = "https://docs.zama.org/tfhe-rs"
homepage = "https://www.zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
readme = "README.md"
keywords = ["encryption", "fhe", "cryptography", "hardware", "fpga"]

View File

@@ -6,8 +6,8 @@ rust-version.workspace = true
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"
description = "Cuda implementation of TFHE-rs' ZK primitives."
homepage = "https://www.zama.org/"
documentation = "https://docs.zama.org/tfhe-rs"
homepage = "https://www.zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
readme = "README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

View File

@@ -1,175 +0,0 @@
# Naming Conventions
This document defines the naming conventions used throughout the zk-cuda-backend codebase.
## Types and Structs
**Rule: `PascalCase`, no underscores.**
| Category | Pattern | Examples |
|----------|---------|----------|
| Field elements | Short math names | `Fp`, `Fp2` (future: `Fp6`, `Fp12`) |
| Big integers | `BigInt<N>` template | `BigInt<7>`, `BigInt<5>` |
| Scalars | Alias of BigInt | `Scalar` (= `BigInt<ZP_LIMBS>`) |
| Curve points (affine) | `G{1,2}Affine` | `G1Affine`, `G2Affine` |
| Curve points (projective) | `G{1,2}Projective` | `G1Projective`, `G2Projective` |
| Enums | `PascalCase` | `ComparisonType` |
**FFI boundary types (api.h):** Use `G1Point`/`G2Point` (affine) and `G1ProjectivePoint`/`G2ProjectivePoint` for C compatibility. Type aliases bridge to internal names.
**Rust types** mirror internal CUDA names: `G1Affine`, `G1Projective`, `G2Affine`, `G2Projective`, `Scalar`.
## Template Parameters and Trait Structs
**Template parameters: `PascalCase` with descriptive suffix.**
| Parameter | Used in |
|-----------|---------|
| `AffineType` | MSM kernels, launch params |
| `ProjectiveType` | MSM kernels, launch params |
| `PointType` | Generic point operations |
| `FieldType` | Trait associated type |
| `N` | `BigInt<N>` |
**Trait structs:**
| Struct | Purpose | Location |
|--------|---------|----------|
| `Affine<T>` | Affine point ops dispatch | `point_traits.h` |
| `Projective<T>` | Projective point ops dispatch | `point_traits.h` |
| `SelectorChooser<T>` | Maps point type -> trait struct | `point_traits.h` |
| `MSMTraits<T>` | Maps projective -> affine type | `curve.h` |
| `MSMWindowSize<T>` | Window size constant per type | `msm.h` |
| `Phase1KernelLaunchParams<T>` | Kernel config for accumulation | `msm_pippenger.cu` |
| `Phase2KernelLaunchParams<T>` | Kernel config for reduction | `msm_pippenger.cu` |
## Functions
### Field Arithmetic
**Rule: `<field>_<operation>` — lowercase snake_case, field prefix.**
| Pattern | Meaning | Examples |
|---------|---------|---------|
| `fp_<op>` | Basic operation | `fp_add`, `fp_sub`, `fp_neg`, `fp_copy`, `fp_cmp` |
| `fp_is_<pred>` | Predicate | `fp_is_zero`, `fp_is_one`, `fp_is_quadratic_residue` |
| `fp_mont_<op>` | Montgomery-domain operation | `fp_mont_mul`, `fp_mont_inv`, `fp_mont_reduce` |
| `fp_<op>_raw` | No modular reduction | `fp_add_raw`, `fp_sub_raw`, `fp_mul_schoolbook_raw` |
| `fp_to_montgomery` / `fp_from_montgomery` | Form conversion | |
| `fp_<constant>` | Return constant (normal form) | `fp_zero`, `fp_one`, `fp_modulus` |
| `fp_<constant>_montgomery` | Return constant (Montgomery) | `fp_one_montgomery`, `fp_two_montgomery` |
Fp2 follows identical patterns with `fp2_` prefix.
### Point Operations (Generic Template)
**Rule: `point_<operation>` for G1/G2-generic operations.**
```
point_add, point_double, point_neg, point_scalar_mul
point_at_infinity, point_to_montgomery, point_from_montgomery
point_to_montgomery_batch
```
### Point Operations (Group-Specific)
**Rule:** Group **leads** when it "owns" the concept; group **trails** when the operation is primary.
| Group leads | Group trails |
|-------------|-------------|
| `g1_point_at_infinity` | `projective_to_affine_g1` |
| `g1_is_infinity` | `normalize_projective_g1` |
| `g1_generator` | `is_on_curve_g1` |
| `g1_projective_point_at_infinity` | `curve_b_g1` |
Overloaded functions omit the group entirely.
### Projective Point Operations
**Rule: `projective_<operation>` prefix.**
`projective_point_add`, `projective_point_double`, `projective_mixed_add`, `projective_scalar_mul` — all overloaded for G1/G2.
### In-Place Host Operations
**Rule: `_inplace` suffix for host-only in-place modifications.**
`point_to_montgomery_inplace`, `point_from_montgomery_inplace`
The CUDA template batch functions (`point_to_montgomery_batch`) are also in-place but omit `_inplace` — this is intentional. The `_inplace` suffix distinguishes the host-only path from the CUDA template path.
### MSM Functions
Internal: `point_msm_g1_async`, `point_msm_g1`, `pippenger_scratch_size_g1` (group suffix).
### CUDA Kernels
**Rule: `kernel_<descriptive_name>` prefix.**
`kernel_accumulate_all_windows`, `kernel_reduce_all_windows`, `kernel_compute_window_sums`, `kernel_clear_buckets`, `kernel_point_add`, `kernel_point_to_montgomery_batch`, etc.
### FFI Wrappers
**Rule: `*_wrapper` suffix.** Group position follows the underlying function's convention:
- Group prefix: `g1_msm_managed_wrapper`, `g1_msm_unmanaged_wrapper_async`, `g1_from_montgomery_wrapper`
- Group suffix: `affine_to_projective_g1_wrapper`, `is_on_curve_g1_wrapper`, `pippenger_scratch_size_g1_wrapper`
- No group: `fp_to_montgomery_wrapper`, `scalar_modulus_limbs_wrapper`
### Rust API
Standard Rust `snake_case`: `to_projective()`, `from_montgomery_normalized()`, `is_infinity()`, `msm()`.
Module-level conversions: `g1_affine_from_montgomery()`, `g1_affine_from_arkworks()`.
## Variables
**Rule: `snake_case` everywhere.**
| Convention | Examples |
|------------|---------|
| Device pointers: `d_` prefix | `d_result`, `d_points`, `d_scratch` |
| Host pointers: no prefix | `result`, `points` |
| Counts: `num_*` | `num_points`, `num_blocks`, `num_windows` |
| Indices: `*_idx` | `window_idx`, `bucket_idx`, `point_idx` |
| Memory sizes: `*_bytes` | `points_bytes`, `scratch_bytes` |
| Booleans: descriptive | `valid`, `overflow`, `points_in_montgomery` |
| Shared memory: `shared_*` | `shared_mem`, `shared_points`, `shared_sums` |
| CUDA params | `stream`, `gpu_index`, `size_tracker` |
## Constants and Macros
**Rule: `UPPER_SNAKE_CASE`.**
| Prefix | Category | Examples |
|--------|----------|---------|
| `FP_` | Field parameters | `FP_LIMBS`, `FP_BITS` |
| `ZP_` | Scalar field | `ZP_LIMBS` |
| `LIMB_` | Limb config | `LIMB_BITS`, `LIMB_MAX` |
| `MSM_G1_` / `MSM_G2_` | MSM per-group | `MSM_G1_WINDOW_SIZE`, `MSM_G2_BUCKET_COUNT` |
| `MSM_` | MSM shared | `MSM_WINDOW_SIZE`, `MSM_SIGNED_BUCKET_COUNT` |
| `KERNEL_` | Kernel config | `KERNEL_THREADS_MAX` |
| `CUDA_` | CUDA arch | `CUDA_WARP_SIZE` |
| `BLS12_446_` | Curve constants | `BLS12_446_MODULUS_LIMBS` |
| `DEVICE_` | `__constant__` memory | `DEVICE_MODULUS`, `DEVICE_R2`, `DEVICE_G1_GENERATOR` |
## Files
| Category | Location | Naming |
|----------|----------|--------|
| CUDA public headers | `cuda/include/*.h` | `fp.h`, `curve.h`, `msm.h`, `point_traits.h` |
| CUDA internal headers | `cuda/src/**/*.cuh` | `common.cuh` |
| CUDA source | `cuda/src/**/*.cu` | `fp.cu`, `curve.cu`, `msm_pippenger.cu` |
| Rust modules | `src/` | `snake_case`: `types`, `conversions`, `bindings`, `g1`, `g2`, `scalar` |
## Async/Sync Pair Convention
```
<operation>_async — launch kernel(s), return immediately
<operation> — call _async, then synchronize
```
**`_async` suffix** for non-blocking; **no suffix** for synchronizing.
**Rule: `_async` is always the last component of the name**, even on `_wrapper` functions.
For example: `point_msm_g1_async` (not `point_msm_async_g1`), `g1_msm_unmanaged_wrapper_async` (not `g1_msm_unmanaged_async_wrapper`).

View File

@@ -1,167 +1,491 @@
# ZK CUDA Backend
A CUDA implementation of BLS12-446 elliptic curve operations for zero-knowledge proof systems.
It provides GPU-accelerated finite field arithmetic, elliptic curve point operations, and
multi-scalar multiplication (MSM) targeting NVIDIA GPUs.
A high-performance CUDA implementation of BLS12-446 elliptic curve operations for zero-knowledge proof systems. This library provides GPU-accelerated finite field arithmetic, elliptic curve point operations, and multi-scalar multiplication (MSM) optimized for NVIDIA GPUs.
The cryptographic operations it provides are:
## Overview
This project implements a CUDA backend for BLS12-446 elliptic curve operations, which are fundamental to zero-knowledge proof systems. The implementation focuses on performance and correctness, providing both host and device-side APIs for maximum flexibility.
**Key Features:**
- Multi-precision finite field arithmetic (Fp) with Montgomery reduction
- Quadratic extension field (Fp2) operations
- Elliptic curve point operations for G1 (over Fp) and G2 (over Fp2) groups
- Elliptic curve operations for G1 and G2 groups
- High-performance Multi-Scalar Multiplication (MSM) using Pippenger's algorithm
- Comprehensive test suite with 100+ tests
- Performance benchmarks
- Rust API bindings
## Project Structure
```
zk-cuda-backend/
├── include/ # Header files
│ ├── fp.h # Fp (finite field) declarations
│ ├── fp2.h # Fp2 (quadratic extension) declarations
│ ├── curve.h # Elliptic curve point operations
│ └── msm.h # Multi-scalar multiplication API
│ # Note: device.h comes from tfhe-cuda-backend
├── src/ # CUDA source files
│ ├── primitives/
│ │ ├── fp.cu # Fp implementation
│ │ └── fp2.cu # Fp2 implementation
│ ├── curve.cu # Curve operations
│ └── msm/ # MSM implementation
│ └── pippenger/ # Pippenger's algorithm
├── tests/ # Test suite
│ ├── primitives/ # Fp and Fp2 tests
│ ├── test_msm.cu # MSM tests
│ ├── test_point_ops.cu # Point operation tests
│ └── test_scalar_mul.cu # Scalar multiplication tests
├── benchmarks/ # Performance benchmarks
│ ├── benchmark_fp.cu # Fp benchmarks
│ ├── benchmark_fp2.cu # Fp2 benchmarks
│ └── benchmark_msm.cu # MSM benchmarks
├── src/ # Rust bindings
│ ├── src/ # Rust source code
│ └── include/ # C wrapper headers
└── utils/ # Utility scripts
```
## BLS12-446 Curve
This implementation targets the **BLS12-446** curve:
- **446-bit prime field** (Fp): 7 limbs of 64 bits (448 bits total, 2 bits headroom)
This implementation targets the **BLS12-446** curve, which uses:
- **446-bit prime field** (Fp): Requires 7 limbs of 64 bits each
- **Two groups**: G1 (over Fp) and G2 (over Fp2)
- **Modulus**: Hardcoded from tfhe-rs reference implementation
## API
The modulus and all curve constants are initialized at compile time and available as device constants for optimal performance.
## Components
### Finite Field Arithmetic (Fp and Fp2)
**Fp** — multi-precision arithmetic for the 446-bit prime field:
- Operations: `fp_add()`, `fp_sub()`, `fp_mul()`, `fp_neg()`, `fp_inv()`, `fp_div()`, `fp_pow()`, `fp_sqrt()`, Montgomery conversions
- Operator overloads: `+`, `-`, `*`, `/`, unary `-`, `+=`, `-=`, `*=`, `/=`, `==`, `!=`
- Montgomery form: `fp_to_montgomery()` / `fp_from_montgomery()` for conversion; `fp_one_montgomery()` etc. for constants
**Fp** - Multi-precision arithmetic for the 446-bit prime field:
- **Structure**: 7 limbs of 64 bits each (448 bits total, 2 bits headroom)
- **Montgomery Reduction**: R = 2^448, matching tfhe-rs implementation
- **Format Tracking**: `mont` field tracks whether values are in Montgomery form
- **Operations**: `fp_add()`, `fp_sub()`, `fp_mul()`, `fp_neg()`, `fp_inv()`, `fp_div()`, `fp_pow()`, `fp_sqrt()`, Montgomery conversions, etc.
**Fp2** — quadratic extension field (Fp2 = Fp[i], i² = -1):
- Operations: `fp2_add()`, `fp2_sub()`, `fp2_mul()`, `fp2_neg()`, `fp2_inv()`, `fp2_div()`, `fp2_square()`, `fp2_conjugate()`, `fp2_frobenius()`
**Fp2** - Quadratic extension field (Fp2 = Fp[i] where i² = -1):
- **Structure**: Two Fp elements (c0, c1) representing a + b*i
- **Operations**: `fp2_add()`, `fp2_sub()`, `fp2_mul()`, `fp2_neg()`, `fp2_inv()`, `fp2_div()`, `fp2_square()`
- **Special**: `fp2_conjugate()`, `fp2_frobenius()`, `fp2_mul_by_i()`
**Operator Overloads** (both Fp and Fp2):
- Arithmetic: `+`, `-`, `*`, `/`, unary `-`
- Compound assignment: `+=`, `-=`, `*=`, `/=`
- Comparison: `==`, `!=`
- Assignment: `=` (replaces `fp_copy()` / `fp2_copy()`)
**CUDA Kernels**: Batch operations for GPU execution
### Elliptic Curve Operations
Point representations:
- **Affine**: `G1Affine`, `G2Affine` — (x, y) with infinity flag
- **Projective**: `G1Projective`, `G2Projective` — (X, Y, Z) homogeneous coordinates
Complete implementation for both G1 and G2 groups:
Operations (template functions work for both G1 and G2):
- `point_add()`, `point_double()`, `point_neg()`, `point_scalar_mul()`
- `affine_to_projective()`, `projective_to_affine_g1()`, `projective_to_affine_g2()`
- `point_to_montgomery_inplace()`, `normalize_from_montgomery_g1()` / `normalize_from_montgomery_g2()`
- Operator overloads on projective points: `+`, unary `-`, `*` (scalar), `+=`, `==`, `!=`
- Generator access: `g1_generator()`, `g2_generator()`
- **Point Representations**:
- **Affine**: (x, y) coordinates with infinity flag (`G1Affine`, `G2Affine`)
- **Projective**: (X, Y, Z) homogeneous coordinates (`G1Projective`, `G2Projective`)
- **Operations**:
- Point addition: `point_add()`
- Point doubling: `point_double()`
- Point negation: `point_neg()`
- Scalar multiplication: `point_scalar_mul()`, `projective_scalar_mul()`
- Coordinate conversion: `affine_to_projective()`, `projective_to_affine()`
- **Operator Overloads** (Projective points):
- Addition: `+` (point addition)
- Negation: unary `-` (point negation)
- Scalar multiplication: `*` (with `Scalar` type)
- Compound assignment: `+=`
- Comparison: `==`, `!=`
- Assignment: `=` (replaces `point_copy()`)
- **Template API**: Generic functions that work for both G1 and G2 points
- **Generator Points**: Hardcoded G1 and G2 generators for BLS12-446
### Multi-Scalar Multiplication (MSM)
Implements Pippenger's bucket method. Window sizes are selected dynamically:
- **G1**: 4-bit windows for n ≤ 256, 5-bit for n ≤ 4096, larger for bigger inputs
- **G2**: fixed 5-bit windows (Fp2 operations are 2× more expensive)
High-performance MSM implementation:
**Unmanaged API** — caller manages all device memory:
```c
// Query required scratch space, then run MSM.
size_t scratch_bytes = pippenger_scratch_size_g1(n, gpu_index);
G1Projective *d_scratch = (G1Projective *)cuda_malloc(scratch_bytes, gpu_index);
point_msm_g1(stream, gpu_index, d_result, d_points, d_scalars, n,
d_scratch, size_tracker, /*gpu_memory_allocated=*/true);
```
- **Algorithm**: Pippenger's bucket method with configurable window sizes
- **Window Sizes**:
- **G1**: 4-bit windows (16 buckets: 0-15)
- **G2**: 5-bit windows (32 buckets: 0-31) - larger windows reduce Horner doublings for more expensive Fp2 operations
- **Features**:
- Supports both G1 and G2 groups
- Uses projective coordinates internally (no inversions)
- Optimized for large batch sizes
- Register-based bucket accumulation for optimal performance
**Managed API** — Rust bindings handle memory allocation and transfers internally:
```rust
let (result, size_tracker) = G1Projective::msm(&points, &scalars, stream, gpu_index, false)?;
```
- **API**:
- BigInt scalars (320-bit, 5 limbs): `point_msm_g1()`, `point_msm_g2()`
- Async/Sync variants: `point_msm_async_*()` and `point_msm_*()`
- **Managed API**: Handles memory allocation and transfers internally (convenient for Rust bindings)
- **Unmanaged API**: Assumes data already on device, caller manages memory (better performance for pure-GPU workflows)
See the [basic examples](cuda/tests_and_benchmarks/tests/basic/) for complete working programs.
## Dependencies
- **Memory**: Device pointer-based API (caller manages memory allocation for unmanaged API)
**Disclaimer**: Compilation on Windows/Mac is not supported. Only Nvidia GPUs are supported.
## Building
- nvidia driver — GPU with Compute Capability ≥ 3.0 (e.g. Ubuntu 20.04: [installation guide](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux))
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) ≥ 10.0
- [gcc](https://gcc.gnu.org/) ≥ 8.0 — see [nvcc/gcc compatibility](https://gist.github.com/ax3l/9489132)
- [cmake](https://cmake.org/) ≥ 3.24
- libclang ≥ 9.0 — for Rust [bindgen requirements](https://rust-lang.github.io/rust-bindgen/requirements.html)
### Dependencies
Dependencies fetched automatically by CMake: Google Test, Google Benchmark.
**Disclaimer**: Compilation on Windows/Mac is not supported yet. Only Nvidia GPUs are supported.
## Build
- nvidia driver - for example, if you're running Ubuntu 20.04 check this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux) for installation. You need an Nvidia GPU with Compute Capability >= 3.0
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) >= 10.0
- [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
- [cmake](https://cmake.org/) >= 3.24
- libclang >= 9.0, to match Rust bindgen [requirements](https://rust-lang.github.io/rust-bindgen/requirements.html)
Dependencies (automatically fetched by CMake):
- Google Test (for testing)
- Google Benchmark (for benchmarks)
### Build Instructions
```bash
cd cuda
cmake -B build
cmake --build build
# Create build directory
mkdir -p build
cd build
# Configure
cmake ..
# Build
cmake --build .
# Or use make
make
```
The compute capability is detected automatically from the first available GPU.
If no GPU is present, the build targets sm_70 (Volta).
### Building Rust API
The Rust API build automatically compiles the CUDA library via `build.rs`. Simply run:
```bash
# From the zk-cuda-backend directory (backends/zk-cuda-backend/)
cargo build --release
```
This will:
1. Automatically configure and build the CUDA library in `cuda/build/` if needed
2. Compile the Rust bindings
3. Link everything together
**Manual CUDA build** (if you need to build the CUDA library separately):
```bash
# Build the C++/CUDA library manually
cd cuda
mkdir -p build
cd build
cmake ..
make
```
## Usage
### C++/CUDA API
#### Basic Fp Operations
```cpp
#include "fp.h"
// Initialize values
Fp a, b, c;
fp_one(a); // a = 1
fp_one(b); // b = 1
// Using operator syntax (preferred)
c = a + b; // c = 2
c = a - b; // c = 0
c = a * b; // c = 1
c = -a; // c = -1 (mod p)
// Compound assignment
c += a; // c = c + a
c *= b; // c = c * b
// Assignment (copies value)
Fp d = a; // d is a copy of a
// Named functions still available
fp_add(c, a, b); // c = a + b = 2
// Convert to Montgomery form
fp_to_montgomery(a, a);
// Montgomery multiplication
fp_mont_mul(c, a, b); // c = a * b (all in Montgomery form)
```
#### Elliptic Curve Operations
```cpp
#include "curve.h"
// Create points
G1Projective p1, p2, result;
// ... initialize point coordinates ...
// Using operator syntax (projective points)
result = p1 + p2; // Point addition
result = -p1; // Point negation
result += p2; // Compound addition
// Scalar multiplication with Scalar type
Scalar s;
// ... initialize scalar ...
result = p1 * s; // result = scalar * point
result = s * p1; // Same as above
// Assignment (copies point)
G1Projective copy = p1;
// Named functions still available for affine points
G1Affine affine_point, affine_result;
uint64_t scalar[5] = {0x1234, 0, 0, 0, 0};
point_scalar_mul(affine_result, affine_point, scalar, 5);
```
#### Multi-Scalar Multiplication
```cpp
#include "msm.h"
#include "device.h" // From tfhe-cuda-backend
// Allocate device memory
G1Affine* d_points;
Scalar* d_scalars; // BigInt (320-bit scalars, 5 limbs)
G1Projective* d_result;
G1Projective* d_scratch;
// Calculate scratch space size
uint32_t n = 1000; // number of points
uint32_t num_blocks = (n + 255) / 256;
size_t scratch_size = (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
// Allocate memory using device wrappers
uint32_t gpu_index = 0;
d_points = (G1Affine*)cuda_malloc(n * sizeof(G1Affine), gpu_index);
d_scalars = (Scalar*)cuda_malloc(n * sizeof(Scalar), gpu_index);
d_result = (G1Projective*)cuda_malloc(sizeof(G1Projective), gpu_index);
d_scratch = (G1Projective*)cuda_malloc(scratch_size, gpu_index);
// Create stream and copy data to device
cudaStream_t stream = cuda_create_stream(gpu_index);
cuda_memcpy_async_to_gpu(d_points, h_points, n * sizeof(G1Affine), stream, gpu_index);
cuda_memcpy_async_to_gpu(d_scalars, h_scalars, n * sizeof(Scalar), stream, gpu_index);
// Perform MSM
point_msm_g1(stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n);
// Copy result back and synchronize
G1Projective result;
cuda_memcpy_async_to_cpu(&result, d_result, sizeof(G1Projective), stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
// Cleanup
cuda_drop(d_points, gpu_index);
cuda_drop(d_scalars, gpu_index);
cuda_drop(d_result, gpu_index);
cuda_drop(d_scratch, gpu_index);
cuda_destroy_stream(stream, gpu_index);
```
### Rust API
The Rust build compiles the CUDA library automatically via `build.rs`:
See the [Rust API README](src/README.md) for detailed usage examples.
```bash
# From backends/zk-cuda-backend/
cargo build --release
```rust
use zk_cuda_backend::{G1Affine, G1Projective, Scalar};
use tfhe_cuda_backend::cuda_create_stream;
// Create points and scalars
let points: Vec<G1Affine> = vec![...];
let scalars: Vec<Scalar> = vec![...];
// Create a CUDA stream (required for MSM)
let gpu_index = 0;
let stream = cuda_create_stream(gpu_index);
// Perform MSM using managed API
// The managed API handles memory allocation and transfers internally
let (result, size_tracker) = G1Projective::msm(
&points,
&scalars,
stream,
gpu_index,
false, // points_in_montgomery: false means points will be converted
)?;
// For G2 points:
use zk_cuda_backend::{G2Affine, G2Projective};
let (g2_result, _) = G2Projective::msm(
&g2_points,
&scalars,
stream,
gpu_index,
true, // points_in_montgomery: true for better performance if already converted
)?;
```
## Testing
The project includes a comprehensive test suite using Google Test.
### Running Tests
```bash
cd cuda/build
ctest --output-on-failure # run all tests
./test_fp # individual test executables
# Run all tests
cd build
ctest --output-on-failure
# Run with verbose output
ctest --verbose
# Run specific test executables
./test_fp
./test_fp2
./test_msm
./test_point_ops
./test_fp --gtest_filter="*Montgomery*" # filter by name
# Run specific test cases
./test_fp --gtest_filter="*Montgomery*"
./test_msm --gtest_filter="*G1*"
```
Test coverage: Fp operations (22+ tests), Fp2 operations, G1/G2 point operations,
projective arithmetic, MSM correctness for various batch sizes.
### Test Coverage
- **Fp Tests** (`test_fp`): 22+ tests covering:
- Basic operations (addition, subtraction, multiplication)
- Montgomery form conversions
- Edge cases (zero, one, large values)
- Property-based tests (commutativity, associativity)
- **Fp2 Tests** (`test_fp2`): Complete coverage of:
- All Fp2 operations
- Montgomery form operations
- Special functions (Frobenius, conjugation)
- **Point Operation Tests** (`test_point_ops`): Verification of:
- Point addition and doubling
- Scalar multiplication
- Coordinate conversions
- Infinity point handling
- **MSM Tests** (`test_msm`): End-to-end verification:
- G1 and G2 MSM correctness
- Various batch sizes
- Comparison with reference implementations
## Benchmarks
Performance benchmarks are available using Google Benchmark:
```bash
cd cuda/build
cd build
./benchmark_fp
./benchmark_fp2
./benchmark_msm
```
## Technical Notes
Benchmarks measure:
- Fp arithmetic operation throughput
- Fp2 operation performance
- MSM performance for various batch sizes
- GPU utilization and memory bandwidth
## Technical Details
### Montgomery Reduction
All internal multiplications use Montgomery form (R = 2^448, matching tfhe-rs).
Precomputed constants: R² mod p, R_INV mod p, p' = -p⁻¹ mod 2⁶⁴.
The `mont` convention: functions documented "MONTGOMERY" expect inputs already in
Montgomery form; "NORMAL" functions handle conversion internally.
- **R value**: 2^448 (matching tfhe-rs)
- **Precomputed constants**: R² mod p, R_INV mod p, p' = -p⁻¹ mod 2⁶⁴
- **Format tracking**: Fp struct includes `mont` field to track representation
- **Efficiency**: All multiplications use Montgomery form internally
### MSM Algorithm
- **Pippenger's algorithm**: Bucket method with configurable window sizes
- **G1**: 4-bit windows (16 buckets)
- **G2**: 5-bit windows (32 buckets) - larger windows reduce expensive Fp2 field operations
- **Projective coordinates**: Avoids expensive field inversions
- **Memory layout**: Optimized for coalesced memory access
- **Thread configuration**: 128 threads/block for both G1 and G2 (optimized for H100 SM occupancy)
- **Register-based accumulation**: Uses register-based bucket accumulation instead of shared memory for better performance
### Memory Management
- **Unmanaged API** (`point_msm_g1`, `point_msm_g2`): all data must be on device;
caller manages allocation and transfers. Use `pippenger_scratch_size_g1/g2()` to
query the required scratch buffer size.
- **Managed API** (Rust `G1Projective::msm()`, `G2Projective::msm()`): handles
allocation, host-to-device copies, and scratch space automatically.
The library provides two MSM API variants:
- **Unmanaged API** (`point_msm_*_unmanaged_wrapper`):
- Assumes all data (points, scalars, scratch space) is already on device
- Caller manages all memory allocation and transfers
- Best for performance-critical applications where data is already on GPU
- Supports `points_in_montgomery` flag to avoid redundant conversions
- **Managed API** (`point_msm_*_managed_wrapper`):
- Handles memory allocation and transfers internally
- Copies data from host to device, runs MSM, copies result back
- Convenient for Rust bindings and host-side code
- Automatically manages scratch space allocation
- **Scratch space**: Required size is `(num_blocks + 1) * BUCKET_COUNT * sizeof(ProjectivePoint)`
- G1: `(num_blocks + 1) * 16 * sizeof(G1Projective)`
- G2: `(num_blocks + 1) * 32 * sizeof(G2Projective)`
- **Stream support**: Async operations with CUDA streams (all operations are async internally)
### CUDA Optimizations
- **Constant memory**: Modulus and curve constants in `__constant__` memory
- **Shared memory**: Used for bucket accumulations in MSM
- **Coalesced access**: Memory access patterns optimized for GPU
- **Separable compilation**: Enabled for better optimization
## Template Functions
Many functions are templated to work with both G1 and G2 points:
```cpp
template<typename PointType>
void point_add(PointType& result, const PointType& p1, const PointType& p2);
```
## Security
### Side-Channel Resistance
This implementation assumes **scalars are public** and is **not** constant-time.
Do not use it for operations where scalars must remain secret.
For ZK proof generation this is acceptable when scalars are derived from public
parameters or are witness values revealed in the proof.
This implementation assumes **scalars are public** and is NOT constant-time.
The MSM and scalar multiplication operations have timing variations that depend
on scalar values (bit length, Hamming weight, specific bit patterns).
For ZK proof generation, this is acceptable if:
- Scalars are derived from public parameters
- Or are witness values that are revealed in the proof anyway
**Do not use this implementation for operations where scalars must remain secret.**
### Input Validation
- **Point validation**: off by default; enable with the `validate_points` feature:
- **Point validation**: Point on-curve validation is optional and controlled by the
`validate_points` feature flag. When disabled (default), malformed points may cause
undefined behavior in curve operations. Enable this feature for untrusted inputs:
```toml
zk-cuda-backend = { version = "...", features = ["validate_points"] }
```
- **Scalar validation**: `Scalar::is_valid()` and `Scalar::reduce_once()` available in the Rust API.
- **Scalar validation**: `Scalar::is_valid()` and `Scalar::reduce_once()` methods available
- **Input size limits**: MSM operations are limited to 100,000 points maximum
- **Division by zero**: Caller must ensure division by zero does not occur (checks must be done at host side)
## Naming Conventions
See [NAMING_CONVENTIONS.md](NAMING_CONVENTIONS.md) for the full reference.
For detailed security information, see [SECURITY.md](SECURITY.md).
## References
- [Pairing-Friendly Curves (BLS12)](https://eprint.iacr.org/2006/372.pdf)
- [Montgomery Reduction — Handbook of Applied Cryptography](https://cacr.uwaterloo.ca/hac/)
- [Pippenger's Algorithm](https://eprint.iacr.org/2012/549.pdf)
- [NVIDIA CUDA Best Practices Guide](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/)
- [tfhe-rs BLS12-446 reference](https://github.com/zama-ai/tfhe-rs/blob/main/tfhe-zk-pok/src/curve_446/mod.rs)
- **BLS12 Curves**: [Pairing-Friendly Curves](https://eprint.iacr.org/2006/372.pdf)
- **Montgomery Reduction**: [Handbook of Applied Cryptography](https://cacr.uwaterloo.ca/hac/)
- **Pippenger's Algorithm**: [On the Evaluation of Powers and Monomials](https://eprint.iacr.org/2012/549.pdf)
- **CUDA Best Practices**: [NVIDIA CUDA Best Practices Guide](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/)
- **TFHE-rs Reference**: [tfhe-rs/tfhe-zk-pok/src/curve_446/mod.rs](https://github.com/zama-ai/tfhe-rs/blob/main/tfhe-zk-pok/src/curve_446/mod.rs)

View File

@@ -46,10 +46,7 @@ fn main() {
}
// Build CUDA library using cmake crate
let limb_bits = std::env::var("ZK_CUDA_LIMB_BITS").unwrap_or_else(|_| "64".to_string());
println!("cargo::rerun-if-env-changed=ZK_CUDA_LIMB_BITS");
let mut cmake_config = cmake::Config::new("cuda");
cmake_config.define("ZK_CUDA_LIMB_BITS", &limb_bits);
let dest = cmake_config.build();
// cmake crate installs to dest/lib subdirectory
@@ -109,7 +106,7 @@ fn main() {
let bindings = bindgen::Builder::default()
.header(header_path.to_str().unwrap())
// Allow only the wrapper functions (C FFI interface)
.allowlist_function(".*_wrapper(_async)?")
.allowlist_function(".*_wrapper")
// Allow the core types needed for FFI
.allowlist_type("G1Point")
.allowlist_type("G2Point")

View File

@@ -51,16 +51,6 @@ else()
set(CMAKE_CUDA_ARCHITECTURES 70)
endif()
# Limb size configuration: 32 or 64 (default: 64)
# 32-bit limbs enable PTX carry-chain optimizations on GPU
set(ZK_CUDA_LIMB_BITS "64" CACHE STRING "Limb size in bits for Fp arithmetic (32 or 64)")
set_property(CACHE ZK_CUDA_LIMB_BITS PROPERTY STRINGS "32" "64")
if(NOT ZK_CUDA_LIMB_BITS STREQUAL "32" AND NOT ZK_CUDA_LIMB_BITS STREQUAL "64")
message(FATAL_ERROR "ZK_CUDA_LIMB_BITS must be 32 or 64, got: ${ZK_CUDA_LIMB_BITS}")
endif()
add_compile_definitions(LIMB_BITS_CONFIG=${ZK_CUDA_LIMB_BITS})
message(STATUS "Limb size: ${ZK_CUDA_LIMB_BITS}-bit")
# Enable CUDA separable compilation for better optimization
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -17,15 +17,7 @@ __host__ __device__ void fp2_zero(Fp2 &a);
// G1 point: (x, y) coordinates in Fp
// Curve equation: y^2 = x^3 + b (short Weierstrass form with a = 0)
// alignas(8) ensures identical struct layout (size 120) in both 32-bit and
// 64-bit limb modes, matching the Rust FFI bindings generated from 64-bit.
// Without this, 32-bit mode produces 116-byte structs (4-byte alignment from
// uint32_t limbs) vs 120 bytes in Rust FFI, causing array stride mismatches
// that corrupt point data for n>1.
// The 4-byte padding overhead is negligible: MSM is compute-bound (Montgomery
// multiplications dominate), and point access patterns in Pippenger-style MSM
// are non-coalescing regardless of struct size.
struct alignas(8) G1Affine {
struct G1Affine {
Fp x;
Fp y;
bool infinity; // true if point at infinity (identity element)
@@ -44,9 +36,7 @@ struct alignas(8) G1Affine {
// G2 point: (x, y) coordinates in Fp2
// Curve equation: y^2 = x^3 + b' (twisted curve over Fp2)
// alignas(8): same rationale as G1Affine above — ensures FFI layout
// compatibility (size 232) between 32-bit and 64-bit limb modes.
struct alignas(8) G2Affine {
struct G2Affine {
Fp2 x;
Fp2 y;
bool infinity; // true if point at infinity (identity element)
@@ -241,6 +231,13 @@ __host__ __device__ const G2Affine &g2_generator();
// points, significantly reducing the number of point operations compared to
// naive methods
// Pippenger algorithm constants
#define MSM_WINDOW_SIZE 4 // 4-bit windows
#define MSM_G1_BUCKET_COUNT \
16 // 2^MSM_WINDOW_SIZE buckets (0-15) - legacy, kept for compatibility
#define MSM_SIGNED_BUCKET_COUNT \
8 // With signed recoding: buckets 1-8 (half the buckets)
// ============================================================================
// Template Async/Sync API for curve operations
// ============================================================================
@@ -335,9 +332,22 @@ void point_to_montgomery_batch(cudaStream_t stream, uint32_t gpu_index,
PointType *d_points, uint32_t n);
// ============================================================================
// MSM Traits (maps projective to affine point types, used by msm.h)
// Refactored MSM API (device pointers only, no allocations/copies/frees)
// ============================================================================
// All pointers are device pointers (already allocated by caller)
// Temporary buffer must be provided by the caller:
// - d_scratch: buffer of size (num_blocks + 1) * MSM_G1_BUCKET_COUNT *
//   sizeof(projective point), i.e. sizeof(G1Projective) or
//   sizeof(G2Projective) to match the type of the d_scratch parameter,
//   where num_blocks = CEIL_DIV(n, threadsPerBlock) (typically
//   256 threads per block).
//   This provides space for:
//   * num_blocks * MSM_G1_BUCKET_COUNT points for per-block bucket
//     accumulations
//   * MSM_G1_BUCKET_COUNT points for final buckets
//   MSM_G1_BUCKET_COUNT is typically 16 (for 4-bit windows)
// Uses Pippenger algorithm (bucket method) with sppark-style single-pass
// accumulation
// Simple traits for MSM template (maps projective to affine point types)
template <typename ProjectivePointType> struct MSMTraits;
template <> struct MSMTraits<G1Projective> {
@@ -348,4 +358,40 @@ template <> struct MSMTraits<G2Projective> {
using AffinePointType = G2Affine;
};
// MSM function declarations are in msm.h
// ============================================================================
// MSM with BigInt5 scalars (default MSM implementation)
// ============================================================================
// These functions accept BigInt5* scalars (320-bit scalars, 5 limbs)
// BigInt5 represents a scalar as 5 limbs of 64 bits (320 bits total)
// Uses projective coordinates internally (no inversions!)
// MSM with BigInt scalars for G1 (projective result)
void point_msm_async_g1(cudaStream_t stream, uint32_t gpu_index,
G1Projective *d_result, const G1Affine *d_points,
const Scalar *d_scalars, G1Projective *d_scratch,
uint32_t n);
void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
G1Projective *d_result, const G1Affine *d_points,
const Scalar *d_scalars, G1Projective *d_scratch, uint32_t n);
// MSM with BigInt scalars for G2 (projective result)
void point_msm_async_g2(cudaStream_t stream, uint32_t gpu_index,
G2Projective *d_result, const G2Affine *d_points,
const Scalar *d_scalars, G2Projective *d_scratch,
uint32_t n);
void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
G2Projective *d_result, const G2Affine *d_points,
const Scalar *d_scalars, G2Projective *d_scratch, uint32_t n);
// Template MSM with BigInt scalars (works for both G1 and G2)
template <typename ProjectivePointType>
void point_msm_async(
cudaStream_t stream, uint32_t gpu_index, ProjectivePointType *d_result,
const typename MSMTraits<ProjectivePointType>::AffinePointType *d_points,
const Scalar *d_scalars, ProjectivePointType *d_scratch, uint32_t n);
template <typename ProjectivePointType>
void point_msm(
cudaStream_t stream, uint32_t gpu_index, ProjectivePointType *d_result,
const typename MSMTraits<ProjectivePointType>::AffinePointType *d_points,
const Scalar *d_scalars, ProjectivePointType *d_scratch, uint32_t n);

View File

@@ -124,11 +124,8 @@ static_assert(sizeof(Fp) == FP_LIMBS * sizeof(UNSIGNED_LIMB),
// Binary arithmetic operators
__host__ __device__ Fp operator+(const Fp &a, const Fp &b);
__host__ __device__ Fp operator-(const Fp &a, const Fp &b);
// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
// form.
// Binary multiplication: returns result in Montgomery form
__host__ __device__ Fp operator*(const Fp &a, const Fp &b);
// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
// form. Computes a * b^{-1} in Montgomery representation.
__host__ __device__ Fp operator/(const Fp &a, const Fp &b);
// Unary negation operator
@@ -142,7 +139,6 @@ __host__ __device__ bool operator!=(const Fp &a, const Fp &b);
__host__ __device__ Fp &operator+=(Fp &a, const Fp &b);
__host__ __device__ Fp &operator-=(Fp &a, const Fp &b);
__host__ __device__ Fp &operator*=(Fp &a, const Fp &b);
// MONTGOMERY: Both inputs must be in Montgomery form.
__host__ __device__ Fp &operator/=(Fp &a, const Fp &b);
// Prime modulus p for BLS12-446
@@ -269,21 +265,12 @@ __host__ __device__ bool fp_sqrt(Fp &c, const Fp &a);
// Uses Euler's criterion: a is a quadratic residue if a^((p-1)/2) = 1 mod p
__host__ __device__ bool fp_is_quadratic_residue(const Fp &a);
// Small-constant multiplication via addition chains (much cheaper than
// fp_mont_mul). MONTGOMERY: input and output must be in Montgomery form.
__host__ __device__ void fp_double(Fp &c, const Fp &a);
__host__ __device__ void fp_mul3(Fp &c, const Fp &a);
__host__ __device__ void fp_mul4(Fp &c, const Fp &a);
__host__ __device__ void fp_mul8(Fp &c, const Fp &a);
// Conditional assignment: if condition, dst = src, else dst unchanged
__host__ __device__ void fp_cmov(Fp &dst, const Fp &src, uint64_t condition);
// Helper functions to access constants
// Get modulus reference (device: from constant memory, host: static copy)
__host__ __device__ const Fp &fp_modulus();
// Get Montgomery reduction constant p' = -p^(-1) mod 2^LIMB_BITS
__host__ __device__ UNSIGNED_LIMB fp_p_prime();
// ============================================================================
// Async/Sync API for device memory operations

View File

@@ -81,12 +81,6 @@ __host__ __device__ void fp2_mul(Fp2 &c, const Fp2 &a, const Fp2 &b);
// NOTE: All inputs and outputs are in Montgomery form (no conversions)
__host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Montgomery squaring: c = a^2 (all in Montgomery form)
// Uses the complex-squaring identity: c0 = (a0+a1)(a0-a1), c1 = 2*a0*a1
// Only 2 Fp multiplications vs 3 for fp2_mont_mul(c, a, a).
// NOTE: All inputs and outputs are in Montgomery form (no conversions)
__host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a);
// Squaring: c = a^2
// (a0 + a1*i)^2 = (a0^2 - a1^2) + 2*a0*a1*i
// Optimized version that uses fewer multiplications
@@ -109,13 +103,6 @@ __host__ __device__ void fp2_mont_inv(Fp2 &c, const Fp2 &a);
// Division: c = a / b = a * b^(-1)
__host__ __device__ void fp2_div(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Small-constant multiplication via addition chains (much cheaper than
// fp2_mont_mul). MONTGOMERY: input and output must be in Montgomery form.
__host__ __device__ void fp2_double(Fp2 &c, const Fp2 &a);
__host__ __device__ void fp2_mul3(Fp2 &c, const Fp2 &a);
__host__ __device__ void fp2_mul4(Fp2 &c, const Fp2 &a);
__host__ __device__ void fp2_mul8(Fp2 &c, const Fp2 &a);
__host__ __device__ void fp2_cmov(Fp2 &dst, const Fp2 &src, uint64_t condition);
// Frobenius map: c = a^p

View File

@@ -14,33 +14,33 @@
// ============================================================================
// Kernel thread configuration
constexpr uint32_t KERNEL_THREADS_MAX = 256;
#define KERNEL_THREADS_MAX 256 // Maximum threads per block for general kernels
// G1 dynamic window selection thresholds
constexpr uint32_t MSM_G1_SMALL_THRESHOLD = 256; // n <= 256: use 4-bit windows
constexpr uint32_t MSM_G1_MEDIUM_THRESHOLD =
4096; // n <= 4096: use 5-bit windows
#define MSM_G1_SMALL_THRESHOLD 256 // n <= 256: use 4-bit windows
#define MSM_G1_MEDIUM_THRESHOLD 4096 // n <= 4096: use 5-bit windows
// Pippenger algorithm parameters
constexpr uint32_t MSM_G1_WINDOW_SIZE = 4; // 4-bit windows for G1
constexpr uint32_t MSM_G1_BUCKET_COUNT = 16; // 2^MSM_G1_WINDOW_SIZE buckets
#define MSM_G1_WINDOW_SIZE 4 // 4-bit windows for G1
#define MSM_G1_BUCKET_COUNT 16 // 2^MSM_G1_WINDOW_SIZE buckets (0-15)
// G2-specific parameters: larger window = fewer Horner doublings
// G2 benefits from larger windows because its field ops are 2x more expensive
constexpr uint32_t MSM_G2_WINDOW_SIZE = 5; // 5-bit windows for G2
constexpr uint32_t MSM_G2_BUCKET_COUNT = 32; // 2^MSM_G2_WINDOW_SIZE buckets
#define MSM_G2_WINDOW_SIZE 5 // 5-bit windows for G2
#define MSM_G2_BUCKET_COUNT 32 // 2^MSM_G2_WINDOW_SIZE buckets (0-31)
// Threads per block for MSM kernels (must match implementation)
// These values are used for scratch space calculation in wrappers
constexpr uint32_t MSM_G1_THREADS_PER_BLOCK = 128;
constexpr uint32_t MSM_G2_THREADS_PER_BLOCK = 128;
#define MSM_G1_THREADS_PER_BLOCK 128 // G1 uses 128 threads per block
#define MSM_G2_THREADS_PER_BLOCK \
128 // G2 uses 128 threads per block (register-based bucket accumulation)
// Helper function to get optimal threads per block for MSM based on point type.
// Uses 128 threads for both G1 and G2 for optimal SM occupancy on H100:
// - G1 with 128 threads: 15.6KB shared mem, allows 3 blocks per SM
// - G2 with 128 threads: 29.8KB shared mem, allows 1 block per SM
// Testing showed 64 threads is worse (25% slower for G2/4096).
template <typename PointType> uint32_t msm_threads_per_block(uint32_t n) {
template <typename PointType> int get_msm_threads_per_block(uint32_t n) {
(void)n;
return 128;
}
@@ -65,60 +65,39 @@ template <> struct MSMWindowSize<G2ProjectivePoint> {
static constexpr uint32_t value = MSM_G2_WINDOW_SIZE;
};
// ============================================================================
// Scratch Size Helpers
// ============================================================================
// Compute the exact scratch buffer size (in bytes) needed by the Pippenger MSM
// implementation for a given input count. These match the internal scratch
// partitioning exactly: all_block_buckets + all_final_buckets + window_sums.
// The gpu_index is needed to query device shared memory limits, which affect
// the per-window block count.
size_t pippenger_scratch_size_g1(uint32_t n, uint32_t gpu_index);
size_t pippenger_scratch_size_g2(uint32_t n, uint32_t gpu_index);
// ============================================================================
// MSM with BigInt Scalars (320-bit scalars, default implementation)
// ============================================================================
// MSM for G1 points with BigInt scalars (projective result)
// Computes: result = sum(scalars[i] * points[i])
// Result is written directly to a host pointer (no device allocation needed for
// the result). Scratch space must be pre-allocated by the caller and passed via
// d_scratch as a typed projective pointer (G1Projective* for G1,
// G2ProjectivePoint* for G2). Use the scratch size helpers to query the
// required allocation size in bytes, then cast the allocation to the
// appropriate projective type.
// Arguments:
// stream: CUDA stream for async execution
// gpu_index: GPU device index
// h_result: Host pointer to output (projective G1 point)
// d_result: Device pointer to output (projective G1 point)
// d_points: Device pointer to input affine G1 points (array of n points)
// d_scalars: Device pointer to input BigInt scalars (array of n scalars)
// d_scratch: Device pointer to scratch buffer for intermediate results
// Required size: (num_blocks + 1) * MSM_G1_BUCKET_COUNT *
// sizeof(G1Projective)
// n: Number of points/scalars
// d_scratch: Caller-provided device scratch buffer for intermediate results
// size_tracker: Reference for tracking GPU memory allocation sizes
void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result, const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated);
void point_msm_async_g1(cudaStream_t stream, uint32_t gpu_index,
G1Projective *d_result, const G1Affine *d_points,
const Scalar *d_scalars, G1Projective *d_scratch,
uint32_t n, uint64_t &size_tracker);
void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result, const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
uint64_t &size_tracker, bool gpu_memory_allocated);
G1Projective *d_result, const G1Affine *d_points,
const Scalar *d_scalars, G1Projective *d_scratch, uint32_t n,
uint64_t &size_tracker);
// MSM for G2 points with BigInt scalars (projective result)
// Result is written directly to a host pointer.
void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result, const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated);
void point_msm_async_g2(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *d_result, const G2Point *d_points,
const Scalar *d_scalars, G2ProjectivePoint *d_scratch,
uint32_t n, uint64_t &size_tracker);
void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result, const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated);
G2ProjectivePoint *d_result, const G2Point *d_points,
const Scalar *d_scalars, G2ProjectivePoint *d_scratch,
uint32_t n, uint64_t &size_tracker);

View File

@@ -1,278 +0,0 @@
#pragma once
#include "curve.h"
#include "fp.h"
#include "fp2.h"
// ============================================================================
// Unified Trait System for Elliptic Curve Points
// ============================================================================
// Provides compile-time dispatch for field and point operations across G1/G2.
// Both affine (curve.cu) and MSM (msm/) code use these traits instead of
// maintaining separate copies.
// Forward declarations for projective point operations (implemented in
// curve.cu)
// Projective + projective addition, overloaded for G1 and G2.
// NOTE(review): by the signature, `result` presumably receives p1 + p2 —
// confirm against the definitions in curve.cu.
__host__ __device__ void projective_point_add(G1Projective &result,
const G1Projective &p1,
const G1Projective &p2);
__host__ __device__ void projective_point_add(G2Projective &result,
const G2Projective &p1,
const G2Projective &p2);
// Projective doubling, overloaded for G1 and G2.
// NOTE(review): `result` presumably receives 2 * p — confirm in curve.cu.
__host__ __device__ void projective_point_double(G1Projective &result,
const G1Projective &p);
__host__ __device__ void projective_point_double(G2Projective &result,
const G2Projective &p);
// Mixed addition: first operand projective, second operand affine.
// Exposed to generic code through Projective<T>::mixed_add below.
__host__ __device__ void projective_mixed_add(G1Projective &result,
const G1Projective &p1,
const G1Affine &p2);
__host__ __device__ void projective_mixed_add(G2Projective &result,
const G2Projective &p1,
const G2Affine &p2);
// ============================================================================
// Affine<T>: trait for affine point operations
// ============================================================================
template <typename PointType> struct Affine;
// Affine trait for G1: coordinates are elements of Fp.
// Every member is a static one-line forwarder to the matching fp_* / g1_*
// routine or Fp operator, so generic point code can be written once against
// Affine<PointType>.
template <> struct Affine<G1Affine> {
  using FieldType = Fp;

  // ---- Point helpers ------------------------------------------------------

  // Turns `pt` into the point at infinity (delegates to g1_point_at_infinity).
  __host__ __device__ static void point_at_infinity(G1Affine &pt) {
    g1_point_at_infinity(pt);
  }

  // True when g1_is_infinity reports `pt` as the point at infinity.
  __host__ __device__ static bool is_infinity(const G1Affine &pt) {
    return g1_is_infinity(pt);
  }

  // Plain struct assignment of a whole point.
  __host__ __device__ static void point_copy(G1Affine &to,
                                             const G1Affine &from) {
    to = from;
  }

  // Curve coefficient b for G1, as returned by curve_b_g1().
  __host__ __device__ static const FieldType &curve_b() { return curve_b_g1(); }

  // ---- Field predicates ---------------------------------------------------

  __host__ __device__ static bool field_is_zero(const FieldType &value) {
    return fp_is_zero(value);
  }

  __host__ __device__ static ComparisonType field_cmp(const FieldType &lhs,
                                                      const FieldType &rhs) {
    return fp_cmp(lhs, rhs);
  }

  // ---- Field arithmetic (first argument is the output) --------------------

  __host__ __device__ static void field_zero(FieldType &out) { fp_zero(out); }

  __host__ __device__ static void field_copy(FieldType &to,
                                             const FieldType &from) {
    to = from;
  }

  __host__ __device__ static void field_neg(FieldType &out,
                                            const FieldType &value) {
    out = -value;
  }

  __host__ __device__ static void field_add(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    out = lhs + rhs;
  }

  __host__ __device__ static void field_sub(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    out = lhs - rhs;
  }

  // Multiplication goes through fp_mont_mul.
  __host__ __device__ static void field_mul(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    fp_mont_mul(out, lhs, rhs);
  }

  // Inversion goes through fp_mont_inv.
  __host__ __device__ static void field_inv(FieldType &out,
                                            const FieldType &value) {
    fp_mont_inv(out, value);
  }

  // ---- Montgomery conversion ----------------------------------------------

  __host__ __device__ static void field_to_montgomery(FieldType &out,
                                                      const FieldType &value) {
    fp_to_montgomery(out, value);
  }

  __host__ __device__ static void
  field_from_montgomery(FieldType &out, const FieldType &value) {
    fp_from_montgomery(out, value);
  }
};
// Affine trait for G2: coordinates are elements of Fp2.
// Mirrors Affine<G1Affine> member-for-member, forwarding to the fp2_* / g2_*
// routines. The Montgomery conversions are applied to each Fp component
// (c0, c1) separately.
template <> struct Affine<G2Affine> {
  using FieldType = Fp2;

  // ---- Point helpers ------------------------------------------------------

  // Turns `pt` into the point at infinity (delegates to g2_point_at_infinity).
  __host__ __device__ static void point_at_infinity(G2Affine &pt) {
    g2_point_at_infinity(pt);
  }

  // True when g2_is_infinity reports `pt` as the point at infinity.
  __host__ __device__ static bool is_infinity(const G2Affine &pt) {
    return g2_is_infinity(pt);
  }

  // Plain struct assignment of a whole point.
  __host__ __device__ static void point_copy(G2Affine &to,
                                             const G2Affine &from) {
    to = from;
  }

  // Curve coefficient b for G2, as returned by curve_b_g2().
  __host__ __device__ static const FieldType &curve_b() { return curve_b_g2(); }

  // ---- Field predicates ---------------------------------------------------

  __host__ __device__ static bool field_is_zero(const FieldType &value) {
    return fp2_is_zero(value);
  }

  __host__ __device__ static ComparisonType field_cmp(const FieldType &lhs,
                                                      const FieldType &rhs) {
    return fp2_cmp(lhs, rhs);
  }

  // ---- Field arithmetic (first argument is the output) --------------------

  __host__ __device__ static void field_zero(FieldType &out) { fp2_zero(out); }

  __host__ __device__ static void field_copy(FieldType &to,
                                             const FieldType &from) {
    to = from;
  }

  __host__ __device__ static void field_neg(FieldType &out,
                                            const FieldType &value) {
    out = -value;
  }

  __host__ __device__ static void field_add(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    out = lhs + rhs;
  }

  __host__ __device__ static void field_sub(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    out = lhs - rhs;
  }

  // Multiplication goes through fp2_mont_mul.
  __host__ __device__ static void field_mul(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    fp2_mont_mul(out, lhs, rhs);
  }

  // Inversion goes through fp2_mont_inv.
  __host__ __device__ static void field_inv(FieldType &out,
                                            const FieldType &value) {
    fp2_mont_inv(out, value);
  }

  // ---- Montgomery conversion (component-wise on c0 and c1) ----------------

  __host__ __device__ static void field_to_montgomery(FieldType &out,
                                                      const FieldType &value) {
    fp_to_montgomery(out.c0, value.c0);
    fp_to_montgomery(out.c1, value.c1);
  }

  __host__ __device__ static void
  field_from_montgomery(FieldType &out, const FieldType &value) {
    fp_from_montgomery(out.c0, value.c0);
    fp_from_montgomery(out.c1, value.c1);
  }
};
// ============================================================================
// Projective<T>: trait for projective point operations
// ============================================================================
// Includes mixed_add() for efficient projective + affine addition used by MSM.
template <typename PointType> struct Projective;
// Projective trait for G1: coordinates are elements of Fp, and the paired
// affine representation is G1Affine. Every member is a static forwarder.
template <> struct Projective<G1Projective> {
  using FieldType = Fp;
  using AffineType = G1Affine;

  // ---- Field helpers (first argument is the output) -----------------------

  __host__ __device__ static void field_zero(FieldType &out) { fp_zero(out); }

  __host__ __device__ static void field_copy(FieldType &to,
                                             const FieldType &from) {
    to = from;
  }

  __host__ __device__ static bool field_is_zero(const FieldType &value) {
    return fp_is_zero(value);
  }

  // Multiplication goes through fp_mont_mul.
  __host__ __device__ static void field_mul(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    fp_mont_mul(out, lhs, rhs);
  }

  __host__ __device__ static void field_sub(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    out = lhs - rhs;
  }

  // ---- Point helpers ------------------------------------------------------

  __host__ __device__ static void point_at_infinity(G1Projective &pt) {
    g1_projective_point_at_infinity(pt);
  }

  // A projective point is treated as infinity exactly when its Z is zero.
  __host__ __device__ static bool is_infinity(const G1Projective &pt) {
    return fp_is_zero(pt.Z);
  }

  __host__ __device__ static void point_copy(G1Projective &to,
                                             const G1Projective &from) {
    to = from;
  }

  // The leading `::` selects the free function; without it the call would
  // resolve to this member itself.
  __host__ __device__ static void affine_to_projective(G1Projective &out,
                                                       const G1Affine &in) {
    ::affine_to_projective(out, in);
  }

  // ---- Group operations ---------------------------------------------------

  __host__ __device__ static void projective_add(G1Projective &out,
                                                 const G1Projective &lhs,
                                                 const G1Projective &rhs) {
    projective_point_add(out, lhs, rhs);
  }

  __host__ __device__ static void projective_double(G1Projective &out,
                                                    const G1Projective &pt) {
    projective_point_double(out, pt);
  }

  // Projective + affine addition (forwards to projective_mixed_add).
  __host__ __device__ static void mixed_add(G1Projective &out,
                                            const G1Projective &lhs,
                                            const G1Affine &rhs) {
    projective_mixed_add(out, lhs, rhs);
  }
};
// Projective trait for G2: coordinates are elements of Fp2, and the paired
// affine representation is G2Affine. Every member is a static forwarder.
template <> struct Projective<G2Projective> {
  using FieldType = Fp2;
  using AffineType = G2Affine;

  // ---- Field helpers (first argument is the output) -----------------------

  __host__ __device__ static void field_zero(FieldType &out) { fp2_zero(out); }

  __host__ __device__ static void field_copy(FieldType &to,
                                             const FieldType &from) {
    to = from;
  }

  __host__ __device__ static bool field_is_zero(const FieldType &value) {
    return fp2_is_zero(value);
  }

  // Multiplication goes through fp2_mont_mul.
  __host__ __device__ static void field_mul(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    fp2_mont_mul(out, lhs, rhs);
  }

  __host__ __device__ static void field_sub(FieldType &out,
                                            const FieldType &lhs,
                                            const FieldType &rhs) {
    out = lhs - rhs;
  }

  // ---- Point helpers ------------------------------------------------------

  __host__ __device__ static void point_at_infinity(G2Projective &pt) {
    g2_projective_point_at_infinity(pt);
  }

  // A projective point is treated as infinity exactly when its Z is zero.
  __host__ __device__ static bool is_infinity(const G2Projective &pt) {
    return fp2_is_zero(pt.Z);
  }

  __host__ __device__ static void point_copy(G2Projective &to,
                                             const G2Projective &from) {
    to = from;
  }

  // The leading `::` selects the free function; without it the call would
  // resolve to this member itself.
  __host__ __device__ static void affine_to_projective(G2Projective &out,
                                                       const G2Affine &in) {
    ::affine_to_projective(out, in);
  }

  // ---- Group operations ---------------------------------------------------

  __host__ __device__ static void projective_add(G2Projective &out,
                                                 const G2Projective &lhs,
                                                 const G2Projective &rhs) {
    projective_point_add(out, lhs, rhs);
  }

  __host__ __device__ static void projective_double(G2Projective &out,
                                                    const G2Projective &pt) {
    projective_point_double(out, pt);
  }

  // Projective + affine addition (forwards to projective_mixed_add).
  __host__ __device__ static void mixed_add(G2Projective &out,
                                            const G2Projective &lhs,
                                            const G2Affine &rhs) {
    projective_mixed_add(out, lhs, rhs);
  }
};
// ============================================================================
// SelectorChooser<T>: maps any point type to its trait struct
// ============================================================================
// Primary template is declared but never defined, so using SelectorChooser
// with a point type that has no specialization below fails to compile.
template <typename PointType> struct SelectorChooser;
// Affine point types select the Affine<> trait.
template <> struct SelectorChooser<G1Affine> {
using Selection = Affine<G1Affine>;
};
template <> struct SelectorChooser<G2Affine> {
using Selection = Affine<G2Affine>;
};
// Projective point types select the Projective<> trait.
template <> struct SelectorChooser<G1Projective> {
using Selection = Projective<G1Projective>;
};
template <> struct SelectorChooser<G2Projective> {
using Selection = Projective<G2Projective>;
};

View File

@@ -3,10 +3,239 @@
#include "fp.h"
#include "fp2.h"
#include "msm.h"
#include "point_traits.h"
#include <cstdio>
#include <cstring>
// ============================================================================
// Template Traits System for Affine Operations
// ============================================================================
// This traits system allows us to write generic point operations that work
// for both G1 (Fp) and G2 (Fp2) points using the same algorithm.
template <typename PointType> struct Affine;
// Affine trait for G1Affine: coordinates are elements of Fp.
// Every member is a static one-line forwarder to the matching fp_* / g1_*
// routine or Fp operator, so the generic point algorithms in this file can be
// written once against Affine<PointType>.
template <> struct Affine<G1Affine> {
  using Field = Fp;

  // ---- Point helpers ------------------------------------------------------

  // Turns `pt` into the point at infinity (delegates to g1_point_at_infinity).
  __host__ __device__ static void point_at_infinity(G1Affine &pt) {
    g1_point_at_infinity(pt);
  }

  // True when g1_is_infinity reports `pt` as the point at infinity.
  __host__ __device__ static bool is_infinity(const G1Affine &pt) {
    return g1_is_infinity(pt);
  }

  // Plain struct assignment of a whole point.
  __host__ __device__ static void point_copy(G1Affine &to,
                                             const G1Affine &from) {
    to = from;
  }

  // Curve coefficient b for G1, as returned by curve_b_g1().
  __host__ __device__ static const Field &curve_b() { return curve_b_g1(); }

  // ---- Field predicates ---------------------------------------------------

  __host__ __device__ static bool field_is_zero(const Field &value) {
    return fp_is_zero(value);
  }

  __host__ __device__ static ComparisonType field_cmp(const Field &lhs,
                                                      const Field &rhs) {
    return fp_cmp(lhs, rhs);
  }

  // ---- Field arithmetic (first argument is the output) --------------------

  __host__ __device__ static void field_zero(Field &out) { fp_zero(out); }

  __host__ __device__ static void field_copy(Field &to, const Field &from) {
    to = from;
  }

  __host__ __device__ static void field_neg(Field &out, const Field &value) {
    out = -value;
  }

  __host__ __device__ static void field_add(Field &out, const Field &lhs,
                                            const Field &rhs) {
    out = lhs + rhs;
  }

  __host__ __device__ static void field_sub(Field &out, const Field &lhs,
                                            const Field &rhs) {
    out = lhs - rhs;
  }

  // Multiplication goes through fp_mont_mul.
  __host__ __device__ static void field_mul(Field &out, const Field &lhs,
                                            const Field &rhs) {
    fp_mont_mul(out, lhs, rhs);
  }

  // Inversion goes through fp_mont_inv.
  __host__ __device__ static void field_inv(Field &out, const Field &value) {
    fp_mont_inv(out, value);
  }

  // ---- Montgomery conversion ----------------------------------------------

  __host__ __device__ static void field_to_montgomery(Field &out,
                                                      const Field &value) {
    fp_to_montgomery(out, value);
  }

  __host__ __device__ static void field_from_montgomery(Field &out,
                                                        const Field &value) {
    fp_from_montgomery(out, value);
  }
};
// Affine trait for G2Affine: coordinates are elements of Fp2.
// Mirrors Affine<G1Affine> member-for-member, forwarding to the fp2_* / g2_*
// routines. The Montgomery conversions are applied to each Fp component
// (c0, c1) separately.
template <> struct Affine<G2Affine> {
  using Field = Fp2;

  // ---- Point helpers ------------------------------------------------------

  // Turns `pt` into the point at infinity (delegates to g2_point_at_infinity).
  __host__ __device__ static void point_at_infinity(G2Affine &pt) {
    g2_point_at_infinity(pt);
  }

  // True when g2_is_infinity reports `pt` as the point at infinity.
  __host__ __device__ static bool is_infinity(const G2Affine &pt) {
    return g2_is_infinity(pt);
  }

  // Plain struct assignment of a whole point.
  __host__ __device__ static void point_copy(G2Affine &to,
                                             const G2Affine &from) {
    to = from;
  }

  // Curve coefficient b for G2, as returned by curve_b_g2().
  __host__ __device__ static const Field &curve_b() { return curve_b_g2(); }

  // ---- Field predicates ---------------------------------------------------

  __host__ __device__ static bool field_is_zero(const Field &value) {
    return fp2_is_zero(value);
  }

  __host__ __device__ static ComparisonType field_cmp(const Field &lhs,
                                                      const Field &rhs) {
    return fp2_cmp(lhs, rhs);
  }

  // ---- Field arithmetic (first argument is the output) --------------------

  __host__ __device__ static void field_zero(Field &out) { fp2_zero(out); }

  __host__ __device__ static void field_copy(Field &to, const Field &from) {
    to = from;
  }

  __host__ __device__ static void field_neg(Field &out, const Field &value) {
    out = -value;
  }

  __host__ __device__ static void field_add(Field &out, const Field &lhs,
                                            const Field &rhs) {
    out = lhs + rhs;
  }

  __host__ __device__ static void field_sub(Field &out, const Field &lhs,
                                            const Field &rhs) {
    out = lhs - rhs;
  }

  // Multiplication goes through fp2_mont_mul.
  __host__ __device__ static void field_mul(Field &out, const Field &lhs,
                                            const Field &rhs) {
    fp2_mont_mul(out, lhs, rhs);
  }

  // Inversion goes through fp2_mont_inv.
  __host__ __device__ static void field_inv(Field &out, const Field &value) {
    fp2_mont_inv(out, value);
  }

  // ---- Montgomery conversion (component-wise on c0 and c1) ----------------

  __host__ __device__ static void field_to_montgomery(Field &out,
                                                      const Field &value) {
    fp_to_montgomery(out.c0, value.c0);
    fp_to_montgomery(out.c1, value.c1);
  }

  __host__ __device__ static void field_from_montgomery(Field &out,
                                                        const Field &value) {
    fp_from_montgomery(out.c0, value.c0);
    fp_from_montgomery(out.c1, value.c1);
  }
};
// Forward declarations for projective point operations (needed by Projective)
// Projective + projective addition, overloaded for G1 and G2.
// NOTE(review): by the signature, `result` presumably receives p1 + p2 —
// confirm against the definitions later in this file.
__host__ __device__ void projective_point_add(G1Projective &result,
const G1Projective &p1,
const G1Projective &p2);
__host__ __device__ void projective_point_add(G2Projective &result,
const G2Projective &p1,
const G2Projective &p2);
// Projective doubling, overloaded for G1 and G2.
// NOTE(review): `result` presumably receives 2 * p — confirm against the
// definitions later in this file.
__host__ __device__ void projective_point_double(G1Projective &result,
const G1Projective &p);
__host__ __device__ void projective_point_double(G2Projective &result,
const G2Projective &p);
// ============================================================================
// Template Traits System for Projective Points
// ============================================================================
template <typename PointType> struct Projective;
// Projective trait for G1Projective: coordinates are elements of Fp, and the
// paired affine representation is G1Affine. Every member is a static
// forwarder so generic projective code can be written once against
// Projective<PointType>.
template <> struct Projective<G1Projective> {
  using Field = Fp;
  // Note: inside this struct the name `Affine` refers to G1Affine, not to the
  // Affine<> trait template.
  using Affine = G1Affine;

  // ---- Field helpers (first argument is the output) -----------------------

  __host__ __device__ static void field_zero(Field &a) { fp_zero(a); }

  __host__ __device__ static void field_copy(Field &dst, const Field &src) {
    dst = src;
  }

  __host__ __device__ static bool field_is_zero(const Field &a) {
    return fp_is_zero(a);
  }

  // Multiplication goes through fp_mont_mul.
  __host__ __device__ static void field_mul(Field &c, const Field &a,
                                            const Field &b) {
    fp_mont_mul(c, a, b);
  }

  __host__ __device__ static void field_sub(Field &c, const Field &a,
                                            const Field &b) {
    c = a - b;
  }

  // ---- Point helpers ------------------------------------------------------

  __host__ __device__ static void point_at_infinity(G1Projective &point) {
    g1_projective_point_at_infinity(point);
  }

  // A projective point is treated as infinity exactly when its Z is zero.
  __host__ __device__ static bool is_infinity(const G1Projective &point) {
    return fp_is_zero(point.Z);
  }

  // Converts an affine point to projective form via the free function of the
  // same name. The `::` qualification is required: an unqualified call is
  // looked up in class scope first, finds this very member (which also
  // disables argument-dependent lookup), and would recurse without end.
  __host__ __device__ static void affine_to_projective(G1Projective &proj,
                                                       const G1Affine &affine) {
    ::affine_to_projective(proj, affine);
  }

  __host__ __device__ static void point_copy(G1Projective &dst,
                                             const G1Projective &src) {
    dst = src;
  }

  // ---- Group operations ---------------------------------------------------

  __host__ __device__ static void projective_add(G1Projective &result,
                                                 const G1Projective &p1,
                                                 const G1Projective &p2) {
    projective_point_add(result, p1, p2);
  }

  __host__ __device__ static void projective_double(G1Projective &result,
                                                    const G1Projective &p) {
    projective_point_double(result, p);
  }
};
// Projective trait for G2Projective: coordinates are elements of Fp2, and the
// paired affine representation is G2Affine. Every member is a static
// forwarder so generic projective code can be written once against
// Projective<PointType>.
template <> struct Projective<G2Projective> {
  using Field = Fp2;
  // Note: inside this struct the name `Affine` refers to G2Affine, not to the
  // Affine<> trait template.
  using Affine = G2Affine;

  // ---- Field helpers (first argument is the output) -----------------------

  __host__ __device__ static void field_zero(Field &a) { fp2_zero(a); }

  __host__ __device__ static void field_copy(Field &dst, const Field &src) {
    dst = src;
  }

  __host__ __device__ static bool field_is_zero(const Field &a) {
    return fp2_is_zero(a);
  }

  // Multiplication goes through fp2_mont_mul.
  __host__ __device__ static void field_mul(Field &c, const Field &a,
                                            const Field &b) {
    fp2_mont_mul(c, a, b);
  }

  __host__ __device__ static void field_sub(Field &c, const Field &a,
                                            const Field &b) {
    c = a - b;
  }

  // ---- Point helpers ------------------------------------------------------

  __host__ __device__ static void point_at_infinity(G2Projective &point) {
    g2_projective_point_at_infinity(point);
  }

  // A projective point is treated as infinity exactly when its Z is zero.
  __host__ __device__ static bool is_infinity(const G2Projective &point) {
    return fp2_is_zero(point.Z);
  }

  // Converts an affine point to projective form via the free function of the
  // same name. The `::` qualification is required: an unqualified call is
  // looked up in class scope first, finds this very member (which also
  // disables argument-dependent lookup), and would recurse without end.
  __host__ __device__ static void affine_to_projective(G2Projective &proj,
                                                       const G2Affine &affine) {
    ::affine_to_projective(proj, affine);
  }

  __host__ __device__ static void point_copy(G2Projective &dst,
                                             const G2Projective &src) {
    dst = src;
  }

  // ---- Group operations ---------------------------------------------------

  __host__ __device__ static void projective_add(G2Projective &result,
                                                 const G2Projective &p1,
                                                 const G2Projective &p2) {
    projective_point_add(result, p1, p2);
  }

  __host__ __device__ static void projective_double(G2Projective &result,
                                                    const G2Projective &p) {
    projective_point_double(result, p);
  }
};
// ============================================================================
// Template Scalar Multiplication for Projective Points
// ============================================================================
@@ -294,7 +523,7 @@ __host__ __device__ void point_neg(PointType &result, const PointType &p) {
template <typename PointType>
__host__ __device__ void point_double(PointType &result, const PointType &p) {
using AffinePoint = Affine<PointType>;
using FieldType = typename AffinePoint::FieldType;
using FieldType = typename AffinePoint::Field;
if (AffinePoint::is_infinity(p) || AffinePoint::field_is_zero(p.y)) {
AffinePoint::point_at_infinity(result);
@@ -333,7 +562,7 @@ template <typename PointType>
__host__ __device__ void point_add(PointType &result, const PointType &p1,
const PointType &p2) {
using AffinePoint = Affine<PointType>;
using FieldType = typename AffinePoint::FieldType;
using Field = typename AffinePoint::Field;
// Handle infinity cases
if (AffinePoint::is_infinity(p1)) {
@@ -346,7 +575,7 @@ __host__ __device__ void point_add(PointType &result, const PointType &p1,
}
// Check if p1 == -p2 (same x, opposite y)
FieldType neg_y2;
Field neg_y2;
AffinePoint::field_neg(neg_y2, p2.y);
if (AffinePoint::field_cmp(p1.x, p2.x) == ComparisonType::Equal &&
AffinePoint::field_cmp(p1.y, neg_y2) == ComparisonType::Equal) {
@@ -362,7 +591,7 @@ __host__ __device__ void point_add(PointType &result, const PointType &p1,
}
// Standard addition: lambda = (y2 - y1) / (x2 - x1)
FieldType dx, dy, lambda, lambda_squared, x_result;
Field dx, dy, lambda, lambda_squared, x_result;
AffinePoint::field_sub(dx, p2.x, p1.x);
AffinePoint::field_sub(dy, p2.y, p1.y);
AffinePoint::field_inv(lambda, dx); // 1 / (x2 - x1)
@@ -374,7 +603,7 @@ __host__ __device__ void point_add(PointType &result, const PointType &p1,
AffinePoint::field_sub(x_result, x_result, p2.x);
// y_result = lambda * (x1 - x_result) - y1
FieldType x1_minus_xr, y_result;
Field x1_minus_xr, y_result;
AffinePoint::field_sub(x1_minus_xr, p1.x, x_result);
AffinePoint::field_mul(y_result, lambda, x1_minus_xr);
AffinePoint::field_sub(y_result, y_result, p1.y);
@@ -494,6 +723,76 @@ __host__ __device__ const Fp2 &curve_b_g2() {
#endif
}
// ============================================================================
// Cached Montgomery Form Constants for Curve Operations
// ============================================================================
// These helpers provide the Montgomery-form constants used by the projective
// point operations so host code does not recompute them on every call.
// For host code: function-local statics (initialization is thread-safe since
// C++11) returned by const reference.
// For device code: the constants are recomputed on each call and written to
// caller-provided output parameters.
// Bundle of the small Fp constants (2, 3, 4 and 8) kept in Montgomery form so
// they can be fed straight into fp_mont_mul.
struct FpMontConstants {
  Fp two, three, four, eight;
};
// Bundle of the small Fp2 constants (2, 3, 4 and 8) kept in Montgomery form so
// they can be fed straight into fp2_mont_mul.
struct Fp2MontConstants {
  Fp2 two, three, four, eight;
};
// Host-only accessor for the shared Fp Montgomery constants.
// The table is filled the first time this function runs (function-local
// static) and the same object is handed back by const reference afterwards.
__host__ const FpMontConstants &get_fp_mont_constants_host() {
  static FpMontConstants cached = [] {
    FpMontConstants table;
    fp_two_montgomery(table.two);
    fp_three_montgomery(table.three);
    fp_four_montgomery(table.four);
    fp_eight_montgomery(table.eight);
    return table;
  }();
  return cached;
}
// Host-only accessor for the shared Fp2 Montgomery constants.
// The table is filled the first time this function runs (function-local
// static) and the same object is handed back by const reference afterwards.
__host__ const Fp2MontConstants &get_fp2_mont_constants_host() {
  static Fp2MontConstants cached = [] {
    Fp2MontConstants table;
    fp2_two_montgomery(table.two);
    fp2_three_montgomery(table.three);
    fp2_four_montgomery(table.four);
    fp2_eight_montgomery(table.eight);
    return table;
  }();
  return cached;
}
// Initialize Fp Montgomery constants (for device code, called once per
// function).
//
// Device-only counterpart of the host-side cached table: instead of reading a
// static, it fills the four caller-provided outputs by calling the matching
// fp_*_montgomery helper for each one.
//
// Parameters (all outputs):
//   two, three, four, eight - receive whatever fp_two_montgomery,
//   fp_three_montgomery, fp_four_montgomery and fp_eight_montgomery write
//   (presumably the Montgomery forms of 2, 3, 4 and 8 - confirm against fp.h).
__device__ void init_fp_mont_constants(Fp &two, Fp &three, Fp &four,
Fp &eight) {
fp_two_montgomery(two);
fp_three_montgomery(three);
fp_four_montgomery(four);
fp_eight_montgomery(eight);
}
// Initialize Fp2 Montgomery constants (for device code, called once per
// function).
//
// Device-only counterpart of the host-side cached table: instead of reading a
// static, it fills the four caller-provided outputs by calling the matching
// fp2_*_montgomery helper for each one.
//
// Parameters (all outputs):
//   two, three, four, eight - receive whatever fp2_two_montgomery,
//   fp2_three_montgomery, fp2_four_montgomery and fp2_eight_montgomery write
//   (presumably the Montgomery forms of 2, 3, 4 and 8 - confirm against fp2.h).
__device__ void init_fp2_mont_constants(Fp2 &two, Fp2 &three, Fp2 &four,
Fp2 &eight) {
fp2_two_montgomery(two);
fp2_three_montgomery(three);
fp2_four_montgomery(four);
fp2_eight_montgomery(eight);
}
// Check if a G1 point is on the curve: y^2 = x^3 + b
// Uses Montgomery form internally for efficiency
__host__ __device__ bool is_on_curve_g1(const G1Affine &point) {
@@ -544,11 +843,11 @@ __host__ __device__ bool is_on_curve_g2(const G2Affine &point) {
// Compute y^2 in Montgomery form
Fp2 y_squared_mont;
fp2_mont_square(y_squared_mont, y_mont);
fp2_mont_mul(y_squared_mont, y_mont, y_mont);
// Compute x^3 in Montgomery form
Fp2 x_squared_mont, x_cubed_mont;
fp2_mont_square(x_squared_mont, x_mont);
fp2_mont_mul(x_squared_mont, x_mont, x_mont);
fp2_mont_mul(x_cubed_mont, x_squared_mont, x_mont);
// Compute x^3 + b' in Montgomery form
@@ -1439,7 +1738,14 @@ __host__ __device__ void projective_point_add(G1Projective &result,
Fp temp1, two_R;
fp_mont_mul(temp1, uu, Z1Z2);
Fp temp2 = temp1 - vvv;
fp_double(two_R, R);
// Compute 2*R using cached Montgomery constant
Fp two_mont;
#ifdef __CUDA_ARCH__
fp_two_montgomery(two_mont);
#else
two_mont = get_fp_mont_constants_host().two;
#endif
fp_mont_mul(two_R, two_mont, R);
A = temp2 - two_R;
// X3 = v * A
@@ -1483,7 +1789,7 @@ __host__ __device__ void projective_point_add(G2Projective &result,
fp2_mont_mul(Y2Z1, p2.Y, p1.Z);
u = Y2Z1 - Y1Z2;
fp2_mont_square(uu, u);
fp2_mont_mul(uu, u, u);
Fp2 X2Z1;
fp2_mont_mul(X2Z1, p2.X, p1.Z);
@@ -1495,7 +1801,7 @@ __host__ __device__ void projective_point_add(G2Projective &result,
return;
}
fp2_mont_square(vv, v);
fp2_mont_mul(vv, v, v);
fp2_mont_mul(vvv, v, vv);
fp2_mont_mul(R, vv, X1Z2);
@@ -1504,7 +1810,14 @@ __host__ __device__ void projective_point_add(G2Projective &result,
Fp2 temp1, two_R;
fp2_mont_mul(temp1, uu, Z1Z2);
Fp2 temp2 = temp1 - vvv;
fp2_double(two_R, R);
// Compute 2*R using cached Montgomery constant
Fp2 two_mont;
#ifdef __CUDA_ARCH__
fp2_two_montgomery(two_mont);
#else
two_mont = get_fp2_mont_constants_host().two;
#endif
fp2_mont_mul(two_R, two_mont, R);
A = temp2 - two_R;
fp2_mont_mul(result.X, v, A);
@@ -1581,7 +1894,14 @@ __host__ __device__ void projective_mixed_add(G1Projective &result,
Fp temp1, two_R;
fp_mont_mul(temp1, uu, p1.Z);
Fp temp2 = temp1 - vvv;
fp_double(two_R, R);
// Compute 2*R
Fp two_mont;
#ifdef __CUDA_ARCH__
fp_two_montgomery(two_mont);
#else
two_mont = get_fp_mont_constants_host().two;
#endif
fp_mont_mul(two_R, two_mont, R);
A = temp2 - two_R;
// X3 = v * A
@@ -1647,8 +1967,8 @@ __host__ __device__ void projective_mixed_add(G2Projective &result,
}
// uu = u^2, vv = v^2, vvv = v * vv
fp2_mont_square(uu, u);
fp2_mont_square(vv, v);
fp2_mont_mul(uu, u, u);
fp2_mont_mul(vv, v, v);
fp2_mont_mul(vvv, v, vv);
// R = vv * X1
@@ -1658,7 +1978,13 @@ __host__ __device__ void projective_mixed_add(G2Projective &result,
Fp2 temp1, two_R;
fp2_mont_mul(temp1, uu, p1.Z);
Fp2 temp2 = temp1 - vvv;
fp2_double(two_R, R);
Fp2 two_mont;
#ifdef __CUDA_ARCH__
fp2_two_montgomery(two_mont);
#else
two_mont = get_fp2_mont_constants_host().two;
#endif
fp2_mont_mul(two_R, two_mont, R);
A = temp2 - two_R;
// X3 = v * A
@@ -1690,10 +2016,22 @@ __host__ __device__ void projective_point_double(G1Projective &result,
// G1 projective doubling using hyperelliptic.org formula
// For curves y^2 = x^3 + a_4*x + b with a_4 = 0
// Get Montgomery constants (cached for host, computed once for device)
Fp two_mont, three_mont, four_mont, eight_mont;
#ifdef __CUDA_ARCH__
init_fp_mont_constants(two_mont, three_mont, four_mont, eight_mont);
#else
const FpMontConstants &c = get_fp_mont_constants_host();
two_mont = c.two;
three_mont = c.three;
four_mont = c.four;
eight_mont = c.eight;
#endif
// A = 3 * X^2
Fp X_sq, A;
fp_mont_mul(X_sq, p.X, p.X);
fp_mul3(A, X_sq);
fp_mont_mul(A, three_mont, X_sq);
// B = Y * Z
Fp B;
@@ -1707,17 +2045,17 @@ __host__ __device__ void projective_point_double(G1Projective &result,
// D = A^2 - 8*C
Fp A_sq, eight_C;
fp_mont_mul(A_sq, A, A);
fp_mul8(eight_C, C);
fp_mont_mul(eight_C, eight_mont, C);
Fp D = A_sq - eight_C;
// X3 = 2 * B * D
// X = 2 * B * D
Fp BD;
fp_mont_mul(BD, B, D);
fp_double(result.X, BD);
fp_mont_mul(result.X, two_mont, BD);
// Y3 = A * (4*C - D) - 8 * Y^2 * B^2
// Y = A * (4*C - D) - 8 * Y^2 * B^2
Fp four_C, A_times_diff;
fp_mul4(four_C, C);
fp_mont_mul(four_C, four_mont, C);
Fp four_C_minus_D = four_C - D;
fp_mont_mul(A_times_diff, A, four_C_minus_D);
@@ -1725,13 +2063,13 @@ __host__ __device__ void projective_point_double(G1Projective &result,
fp_mont_mul(Y_sq, p.Y, p.Y);
fp_mont_mul(B_sq, B, B);
fp_mont_mul(Y_sq_B_sq, Y_sq, B_sq);
fp_mul8(eight_Y_sq_B_sq, Y_sq_B_sq);
fp_mont_mul(eight_Y_sq_B_sq, eight_mont, Y_sq_B_sq);
result.Y = A_times_diff - eight_Y_sq_B_sq;
// Z3 = 8 * B^3
// Z = 8 * B^3
Fp B_cu;
fp_mont_mul(B_cu, B_sq, B);
fp_mul8(result.Z, B_cu);
fp_mont_mul(result.Z, eight_mont, B_cu);
}
// Projective point doubling: result = 2 * p (no inversions!) - G2
@@ -1747,10 +2085,22 @@ __host__ __device__ void projective_point_double(G2Projective &result,
// G2 projective doubling (same as G1 but with Fp2)
// Get Montgomery constants (cached for host, computed once for device)
Fp2 two_mont, three_mont, four_mont, eight_mont;
#ifdef __CUDA_ARCH__
init_fp2_mont_constants(two_mont, three_mont, four_mont, eight_mont);
#else
const Fp2MontConstants &c = get_fp2_mont_constants_host();
two_mont = c.two;
three_mont = c.three;
four_mont = c.four;
eight_mont = c.eight;
#endif
// A = 3 * X^2
Fp2 X_sq, A;
fp2_mont_square(X_sq, p.X);
fp2_mul3(A, X_sq);
fp2_mont_mul(X_sq, p.X, p.X);
fp2_mont_mul(A, three_mont, X_sq);
// B = Y * Z
Fp2 B;
@@ -1763,32 +2113,32 @@ __host__ __device__ void projective_point_double(G2Projective &result,
// D = A^2 - 8*C
Fp2 A_sq, eight_C;
fp2_mont_square(A_sq, A);
fp2_mul8(eight_C, C);
fp2_mont_mul(A_sq, A, A);
fp2_mont_mul(eight_C, eight_mont, C);
Fp2 D = A_sq - eight_C;
// X3 = 2 * B * D
// X = 2 * B * D
Fp2 BD;
fp2_mont_mul(BD, B, D);
fp2_double(result.X, BD);
fp2_mont_mul(result.X, two_mont, BD);
// Y3 = A * (4*C - D) - 8 * Y^2 * B^2
// Y = A * (4*C - D) - 8 * Y^2 * B^2
Fp2 four_C, A_times_diff;
fp2_mul4(four_C, C);
fp2_mont_mul(four_C, four_mont, C);
Fp2 four_C_minus_D = four_C - D;
fp2_mont_mul(A_times_diff, A, four_C_minus_D);
Fp2 Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;
fp2_mont_square(Y_sq, p.Y);
fp2_mont_square(B_sq, B);
fp2_mont_mul(Y_sq, p.Y, p.Y);
fp2_mont_mul(B_sq, B, B);
fp2_mont_mul(Y_sq_B_sq, Y_sq, B_sq);
fp2_mul8(eight_Y_sq_B_sq, Y_sq_B_sq);
fp2_mont_mul(eight_Y_sq_B_sq, eight_mont, Y_sq_B_sq);
result.Y = A_times_diff - eight_Y_sq_B_sq;
// Z3 = 8 * B^3
// Z = 8 * B^3
Fp2 B_cu;
fp2_mont_mul(B_cu, B_sq, B);
fp2_mul8(result.Z, B_cu);
fp2_mont_mul(result.Z, eight_mont, B_cu);
}
// Explicit template instantiations for projective_scalar_mul (needed by MSM)

View File

@@ -1,15 +1,286 @@
#pragma once
#include "curve.h"
#include "fp.h"
#include "fp2.h"
#include "point_traits.h"
// Forward declarations for projective point operations (implemented in
// curve.cu)
__host__ __device__ void projective_point_add(G1Projective &result,
const G1Projective &p1,
const G1Projective &p2);
__host__ __device__ void projective_point_add(G2Projective &result,
const G2Projective &p1,
const G2Projective &p2);
__host__ __device__ void projective_point_double(G1Projective &result,
const G1Projective &p);
__host__ __device__ void projective_point_double(G2Projective &result,
const G2Projective &p);
// Mixed addition: projective + affine (saves 3 field muls vs
// projective+projective)
__host__ __device__ void projective_mixed_add(G1Projective &result,
const G1Projective &p1,
const G1Affine &p2);
__host__ __device__ void projective_mixed_add(G2Projective &result,
const G2Projective &p1,
const G2Affine &p2);
// Multi-Scalar Multiplication (MSM) common code
// Template traits used by MSM algorithms
// Note: projective_point_add and projective_point_double are declared in
// curve.h
// ============================================================================
// Template Traits (needed by MSM kernels)
// ============================================================================
template <typename PointType> struct PointSelector;
// Specialization for G1Affine (coordinates live in Fp).
//
// Static trait bundle that exposes the Fp field primitives and the G1 affine
// point helpers under generic names, so templated code can be written once for
// both G1 and G2.
template <> struct PointSelector<G1Affine> {
  using FieldType = Fp;

  // ---- Constants and predicates -------------------------------------------

  // Sets `a` to the Fp zero element.
  __host__ __device__ static void field_zero(FieldType &a) { fp_zero(a); }

  // True when `a` is the Fp zero element.
  __host__ __device__ static bool field_is_zero(const FieldType &a) {
    return fp_is_zero(a);
  }

  // Three-way comparison of two field elements.
  __host__ __device__ static ComparisonType field_cmp(const FieldType &a,
                                                      const FieldType &b) {
    return fp_cmp(a, b);
  }

  // Curve coefficient b for G1.
  __host__ __device__ static const FieldType &curve_b() {
    return curve_b_g1();
  }

  // ---- Field arithmetic -----------------------------------------------------

  // dst = src
  __host__ __device__ static void field_copy(FieldType &dst,
                                             const FieldType &src) {
    dst = src;
  }

  // c = -a
  __host__ __device__ static void field_neg(FieldType &c, const FieldType &a) {
    c = -a;
  }

  // c = a + b
  __host__ __device__ static void field_add(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    c = a + b;
  }

  // c = a - b
  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    c = a - b;
  }

  // c = a * b (Montgomery multiplication)
  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    fp_mont_mul(c, a, b);
  }

  // c = a^-1 (Montgomery inversion)
  __host__ __device__ static void field_inv(FieldType &c, const FieldType &a) {
    fp_mont_inv(c, a);
  }

  // ---- Montgomery conversions -----------------------------------------------

  // c = Montgomery form of a
  __host__ __device__ static void field_to_montgomery(FieldType &c,
                                                      const FieldType &a) {
    fp_to_montgomery(c, a);
  }

  // c = a converted back out of Montgomery form
  __host__ __device__ static void field_from_montgomery(FieldType &c,
                                                        const FieldType &a) {
    fp_from_montgomery(c, a);
  }

  // ---- Point helpers ----------------------------------------------------------

  // Sets `point` to the G1 point at infinity.
  __host__ __device__ static void point_at_infinity(G1Affine &point) {
    g1_point_at_infinity(point);
  }

  // True when `point` is the G1 point at infinity.
  __host__ __device__ static bool is_infinity(const G1Affine &point) {
    return g1_is_infinity(point);
  }

  // dst = src
  __host__ __device__ static void point_copy(G1Affine &dst,
                                             const G1Affine &src) {
    dst = src;
  }
};
// Specialization for G2Point (coordinates live in Fp2).
//
// Static trait bundle that exposes the Fp2 field primitives and the G2 affine
// point helpers under generic names, so templated code can be written once for
// both G1 and G2.
template <> struct PointSelector<G2Point> {
  using FieldType = Fp2;

  // ---- Constants and predicates -------------------------------------------

  // Sets `a` to the Fp2 zero element.
  __host__ __device__ static void field_zero(FieldType &a) { fp2_zero(a); }

  // True when `a` is the Fp2 zero element.
  __host__ __device__ static bool field_is_zero(const FieldType &a) {
    return fp2_is_zero(a);
  }

  // Three-way comparison of two field elements.
  __host__ __device__ static ComparisonType field_cmp(const FieldType &a,
                                                      const FieldType &b) {
    return fp2_cmp(a, b);
  }

  // Curve coefficient b for G2.
  __host__ __device__ static const FieldType &curve_b() {
    return curve_b_g2();
  }

  // ---- Field arithmetic -----------------------------------------------------

  // dst = src
  __host__ __device__ static void field_copy(FieldType &dst,
                                             const FieldType &src) {
    dst = src;
  }

  // c = -a
  __host__ __device__ static void field_neg(FieldType &c, const FieldType &a) {
    c = -a;
  }

  // c = a + b
  __host__ __device__ static void field_add(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    c = a + b;
  }

  // c = a - b
  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    c = a - b;
  }

  // c = a * b (Montgomery multiplication)
  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    fp2_mont_mul(c, a, b);
  }

  // c = a^-1 (Montgomery inversion)
  __host__ __device__ static void field_inv(FieldType &c, const FieldType &a) {
    fp2_mont_inv(c, a);
  }

  // ---- Montgomery conversions -----------------------------------------------

  // Converts each Fp component (c0, then c1) into Montgomery form.
  __host__ __device__ static void field_to_montgomery(FieldType &c,
                                                      const FieldType &a) {
    fp_to_montgomery(c.c0, a.c0);
    fp_to_montgomery(c.c1, a.c1);
  }

  // Converts each Fp component (c0, then c1) back out of Montgomery form.
  __host__ __device__ static void field_from_montgomery(FieldType &c,
                                                        const FieldType &a) {
    fp_from_montgomery(c.c0, a.c0);
    fp_from_montgomery(c.c1, a.c1);
  }

  // ---- Point helpers ----------------------------------------------------------

  // Sets `point` to the G2 point at infinity.
  __host__ __device__ static void point_at_infinity(G2Point &point) {
    g2_point_at_infinity(point);
  }

  // True when `point` is the G2 point at infinity.
  __host__ __device__ static bool is_infinity(const G2Point &point) {
    return g2_is_infinity(point);
  }

  // dst = src
  __host__ __device__ static void point_copy(G2Point &dst, const G2Point &src) {
    dst = src;
  }
};
template <typename ProjectiveType> struct ProjectiveSelector;
// Specialization for G1Projective (coordinates live in Fp).
//
// Static trait bundle used by the MSM code: it forwards generic field and
// projective-point operations to the G1 / Fp implementations.
template <> struct ProjectiveSelector<G1Projective> {
  using FieldType = Fp;
  using AffineType = G1Affine;

  // ---- Field helpers ----------------------------------------------------------

  // Sets `a` to the Fp zero element.
  __host__ __device__ static void field_zero(FieldType &a) { fp_zero(a); }

  // True when `a` is the Fp zero element.
  __host__ __device__ static bool field_is_zero(const FieldType &a) {
    return fp_is_zero(a);
  }

  // dst = src
  __host__ __device__ static void field_copy(FieldType &dst,
                                             const FieldType &src) {
    dst = src;
  }

  // c = a * b (Montgomery multiplication)
  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    fp_mont_mul(c, a, b);
  }

  // c = a - b
  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    c = a - b;
  }

  // ---- Point construction and predicates ------------------------------------

  // Sets `point` to the G1 projective point at infinity.
  __host__ __device__ static void point_at_infinity(G1Projective &point) {
    g1_projective_point_at_infinity(point);
  }

  // A projective point is treated as infinity when its Z coordinate is zero.
  __host__ __device__ static bool is_infinity(const G1Projective &point) {
    return fp_is_zero(point.Z);
  }

  // Converts an affine point to projective coordinates. The `::` qualifier
  // selects the free function rather than this member of the same name.
  __host__ __device__ static void affine_to_projective(G1Projective &proj,
                                                       const G1Affine &affine) {
    ::affine_to_projective(proj, affine);
  }

  // dst = src
  __host__ __device__ static void point_copy(G1Projective &dst,
                                             const G1Projective &src) {
    dst = src;
  }

  // ---- Group operations -------------------------------------------------------

  // result = p1 + p2 (both projective)
  __host__ __device__ static void projective_add(G1Projective &result,
                                                 const G1Projective &p1,
                                                 const G1Projective &p2) {
    projective_point_add(result, p1, p2);
  }

  // result = 2 * p
  __host__ __device__ static void projective_double(G1Projective &result,
                                                    const G1Projective &p) {
    projective_point_double(result, p);
  }

  // Mixed addition: result = p1 (projective) + p2 (affine); saves 3 field muls
  // compared with a projective + projective addition.
  __host__ __device__ static void
  mixed_add(G1Projective &result, const G1Projective &p1, const G1Affine &p2) {
    projective_mixed_add(result, p1, p2);
  }
};
// Specialization for G2ProjectivePoint (coordinates live in Fp2).
// Note: G2ProjectivePoint is a type alias for G2Projective.
//
// Static trait bundle used by the MSM code: it forwards generic field and
// projective-point operations to the G2 / Fp2 implementations.
template <> struct ProjectiveSelector<G2ProjectivePoint> {
  using FieldType = Fp2;
  using AffineType = G2Point;

  // ---- Field helpers ----------------------------------------------------------

  // Sets `a` to the Fp2 zero element.
  __host__ __device__ static void field_zero(FieldType &a) { fp2_zero(a); }

  // True when `a` is the Fp2 zero element.
  __host__ __device__ static bool field_is_zero(const FieldType &a) {
    return fp2_is_zero(a);
  }

  // dst = src
  __host__ __device__ static void field_copy(FieldType &dst,
                                             const FieldType &src) {
    dst = src;
  }

  // c = a * b (Montgomery multiplication)
  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    fp2_mont_mul(c, a, b);
  }

  // c = a - b
  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
                                            const FieldType &b) {
    c = a - b;
  }

  // ---- Point construction and predicates ------------------------------------

  // Sets `point` to the G2 projective point at infinity.
  __host__ __device__ static void point_at_infinity(G2ProjectivePoint &point) {
    g2_projective_point_at_infinity(point);
  }

  // A projective point is treated as infinity when its Z coordinate is zero.
  __host__ __device__ static bool is_infinity(const G2ProjectivePoint &point) {
    return fp2_is_zero(point.Z);
  }

  // Converts an affine point to projective coordinates. The `::` qualifier
  // selects the free function rather than this member of the same name.
  __host__ __device__ static void affine_to_projective(G2ProjectivePoint &proj,
                                                       const G2Point &affine) {
    ::affine_to_projective(proj, affine);
  }

  // dst = src
  __host__ __device__ static void point_copy(G2ProjectivePoint &dst,
                                             const G2ProjectivePoint &src) {
    dst = src;
  }

  // ---- Group operations -------------------------------------------------------

  // result = p1 + p2 (both projective)
  __host__ __device__ static void projective_add(G2ProjectivePoint &result,
                                                 const G2ProjectivePoint &p1,
                                                 const G2ProjectivePoint &p2) {
    projective_point_add(result, p1, p2);
  }

  // result = 2 * p
  __host__ __device__ static void
  projective_double(G2ProjectivePoint &result, const G2ProjectivePoint &p) {
    projective_point_double(result, p);
  }

  // Mixed addition: result = p1 (projective) + p2 (affine); saves 3 field muls
  // compared with a projective + projective addition.
  __host__ __device__ static void mixed_add(G2ProjectivePoint &result,
                                            const G2ProjectivePoint &p1,
                                            const G2Point &p2) {
    projective_mixed_add(result, p1, p2);
  }
};
// ============================================================================
// MSM Kernel Templates (defined here so they're visible when instantiated)
// ============================================================================
// Compile-time map from a point type to the trait bundle that serves it:
// affine point types resolve to PointSelector, projective point types resolve
// to ProjectiveSelector. The result is exposed as the nested `Selection` alias.
template <typename PointType> struct SelectorChooser;

// Affine point types.
template <> struct SelectorChooser<G1Affine> {
  using Selection = PointSelector<G1Affine>;
};
template <> struct SelectorChooser<G2Point> {
  using Selection = PointSelector<G2Point>;
};

// Projective point types.
template <> struct SelectorChooser<G1Projective> {
  using Selection = ProjectiveSelector<G1Projective>;
};
template <> struct SelectorChooser<G2ProjectivePoint> {
  using Selection = ProjectiveSelector<G2ProjectivePoint>;
};
// Pippenger kernel: Clear buckets (works for both affine and projective points)
template <typename PointType>
__global__ void kernel_clear_buckets(PointType *__restrict__ buckets,
uint32_t num_buckets) {
__global__ void kernel_clear_buckets(PointType *buckets, uint32_t num_buckets) {
using AffinePoint = typename SelectorChooser<PointType>::Selection;
uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -22,11 +293,11 @@ __global__ void kernel_clear_buckets(PointType *__restrict__ buckets,
// blocks OPTIMIZED: Uses parallel tree reduction instead of sequential loop
// Launch config: <<<num_buckets, min(num_blocks, 256), shared_mem>>>
template <typename ProjectiveType>
__global__ void
kernel_reduce_buckets(ProjectiveType *__restrict__ final_buckets,
const ProjectiveType *__restrict__ block_buckets,
uint32_t num_blocks, uint32_t num_buckets) {
using ProjectivePoint = Projective<ProjectiveType>;
__global__ void kernel_reduce_buckets(ProjectiveType *final_buckets,
const ProjectiveType *block_buckets,
uint32_t num_blocks,
uint32_t num_buckets) {
using ProjectivePoint = ProjectiveSelector<ProjectiveType>;
// Each block handles one bucket, threads cooperate to reduce all block
// contributions

View File

@@ -8,64 +8,55 @@
// Multi-Scalar Multiplication (MSM) using Pippenger algorithm for BLS12-446
// Forward declarations for Pippenger implementations
void point_msm_g1_pippenger_async(
cudaStream_t stream, uint32_t gpu_index, G1Projective *h_result,
const G1Affine *d_points, const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch, uint64_t &size_tracker, bool gpu_memory_allocated);
void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result,
void point_msm_async_g1_pippenger(cudaStream_t stream, uint32_t gpu_index,
G1Projective *d_result,
const G1Affine *d_points,
const Scalar *d_scalars,
G1Projective *d_scratch, uint32_t n,
uint64_t &size_tracker);
void point_msm_async_g2_pippenger(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *d_result,
const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch,
uint64_t &size_tracker,
bool gpu_memory_allocated);
const Scalar *d_scalars,
G2ProjectivePoint *d_scratch, uint32_t n,
uint64_t &size_tracker);
// ============================================================================
// Public MSM API for BigInt scalars
// ============================================================================
// MSM with BigInt scalars for G1 (projective coordinates internally)
// Result is written directly to the host pointer h_result.
void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result, const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated) {
point_msm_g1_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
n, d_scratch, size_tracker,
gpu_memory_allocated);
void point_msm_async_g1(cudaStream_t stream, uint32_t gpu_index,
G1Projective *d_result, const G1Affine *d_points,
const Scalar *d_scalars, G1Projective *d_scratch,
uint32_t n, uint64_t &size_tracker) {
point_msm_async_g1_pippenger(stream, gpu_index, d_result, d_points, d_scalars,
d_scratch, n, size_tracker);
}
// MSM with BigInt scalars for G2 (projective coordinates internally)
// Result is written directly to the host pointer h_result.
void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result, const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated) {
point_msm_g2_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
n, d_scratch, size_tracker,
gpu_memory_allocated);
void point_msm_async_g2(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *d_result, const G2Point *d_points,
const Scalar *d_scalars, G2ProjectivePoint *d_scratch,
uint32_t n, uint64_t &size_tracker) {
point_msm_async_g2_pippenger(stream, gpu_index, d_result, d_points, d_scalars,
d_scratch, n, size_tracker);
}
void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result, const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
uint64_t &size_tracker, bool gpu_memory_allocated) {
point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
d_scratch, size_tracker, gpu_memory_allocated);
// The async impl already syncs internally before the CPU-side Horner phase,
// so the stream is idle here. This sync is kept for defensive correctness.
G1Projective *d_result, const G1Affine *d_points,
const Scalar *d_scalars, G1Projective *d_scratch, uint32_t n,
uint64_t &size_tracker) {
point_msm_async_g1(stream, gpu_index, d_result, d_points, d_scalars,
d_scratch, n, size_tracker);
cuda_synchronize_stream(stream, gpu_index);
}
void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result, const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated) {
point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
d_scratch, size_tracker, gpu_memory_allocated);
// See comment in point_msm_g1 above.
G2ProjectivePoint *d_result, const G2Point *d_points,
const Scalar *d_scalars, G2ProjectivePoint *d_scratch,
uint32_t n, uint64_t &size_tracker) {
point_msm_async_g2(stream, gpu_index, d_result, d_points, d_scalars,
d_scratch, n, size_tracker);
cuda_synchronize_stream(stream, gpu_index);
}

View File

@@ -1,12 +1,11 @@
#include "../common.cuh"
#include "checked_arithmetic.h"
#include "curve.h"
#include "device.h"
#include "fp.h"
#include "fp2.h"
#include "msm.h"
#include <algorithm>
#include <type_traits>
#include <cstring>
#include <vector>
// ============================================================================
@@ -47,11 +46,6 @@ template <typename AffineType> struct Phase1KernelLaunchParams {
adjusted_threads_per_block =
std::min(requested_threads_per_block, max_threads_for_shared_mem);
PANIC_IF_FALSE(adjusted_threads_per_block > 0,
"Phase1KernelLaunchParams: insufficient shared memory for "
"kernel launch (max_shared=%u, fixed=%zu)",
max_shared_mem_per_block, fixed_shared_mem);
// Calculate number of blocks per window
num_blocks_per_window = CEIL_DIV(n, adjusted_threads_per_block);
@@ -77,24 +71,16 @@ template <typename ProjectiveType> struct Phase2KernelLaunchParams {
// Cap threads to respect shared memory limit
uint32_t threads = std::min(requested_threads, max_threads_for_shared);
threads = std::min(threads, static_cast<uint32_t>(KERNEL_THREADS_MAX));
threads = std::min(threads, (uint32_t)KERNEL_THREADS_MAX);
// Round up to nearest power of 2 (required for tree reduction)
uint32_t pow2_threads = 1;
while (pow2_threads < threads)
pow2_threads *= 2;
// After rounding to power of 2, verify shared memory doesn't exceed device
// limit
if (safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(pow2_threads)) >
max_shared_mem_per_block) {
pow2_threads /= 2;
}
adjusted_threads = pow2_threads;
// Calculate actual shared memory requirement
shared_mem =
safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(adjusted_threads));
shared_mem = adjusted_threads * sizeof(ProjectiveType);
}
};
@@ -153,6 +139,18 @@ __device__ __forceinline__ uint32_t extract_window_bigint(
window_size);
}
// Forward declarations for projective point operations (needed by kernels)
__host__ __device__ void projective_point_add(G1Projective &result,
const G1Projective &p1,
const G1Projective &p2);
__host__ __device__ void projective_point_add(G2ProjectivePoint &result,
const G2ProjectivePoint &p1,
const G2ProjectivePoint &p2);
__host__ __device__ void projective_point_double(G1Projective &result,
const G1Projective &p);
__host__ __device__ void projective_point_double(G2ProjectivePoint &result,
const G2ProjectivePoint &p);
// Kernel: Accumulate ALL windows in parallel using SORT-THEN-REDUCE
// Grid: (num_windows * num_blocks_per_window) blocks
// Each block processes points for ONE window
@@ -160,12 +158,12 @@ __device__ __forceinline__ uint32_t extract_window_bigint(
// Uses mixed addition (affine + projective) to save 3 field muls per add
template <typename AffineType, typename ProjectiveType>
__global__ void kernel_accumulate_all_windows(
ProjectiveType *__restrict__ all_block_buckets, // [num_windows * num_blocks
// * bucket_count]
const AffineType *__restrict__ points, const Scalar *__restrict__ scalars,
uint32_t num_points, uint32_t num_windows, uint32_t num_blocks_per_window,
uint32_t window_size, uint32_t bucket_count) {
using ProjectivePoint = Projective<ProjectiveType>;
ProjectiveType
*all_block_buckets, // [num_windows * num_blocks * bucket_count]
const AffineType *points, const Scalar *scalars, uint32_t num_points,
uint32_t num_windows, uint32_t num_blocks_per_window, uint32_t window_size,
uint32_t bucket_count) {
using ProjectivePoint = ProjectiveSelector<ProjectiveType>;
const uint32_t window_idx = blockIdx.x / num_blocks_per_window;
const uint32_t block_within_window = blockIdx.x % num_blocks_per_window;
@@ -290,14 +288,12 @@ __global__ void kernel_accumulate_all_windows(
// Each block reduces one (window, bucket) pair across all block contributions
template <typename ProjectiveType>
__global__ void kernel_reduce_all_windows(
ProjectiveType
*__restrict__ all_final_buckets, // [num_windows * NUM_BUCKETS]
ProjectiveType *all_final_buckets, // [num_windows * NUM_BUCKETS]
const ProjectiveType
*__restrict__ all_block_buckets, // [num_windows * num_blocks *
// NUM_BUCKETS]
*all_block_buckets, // [num_windows * num_blocks * NUM_BUCKETS]
uint32_t num_windows, uint32_t num_blocks_per_window,
uint32_t num_buckets) {
using ProjectivePoint = Projective<ProjectiveType>;
using ProjectivePoint = ProjectiveSelector<ProjectiveType>;
const uint32_t flat_idx = blockIdx.x;
const uint32_t window_idx = flat_idx / num_buckets;
@@ -361,11 +357,10 @@ __global__ void kernel_reduce_all_windows(
// Each block computes the window sum: sum(i * bucket[i]) for i=1..15
template <typename ProjectiveType>
__global__ void kernel_compute_window_sums(
ProjectiveType *__restrict__ window_sums, // [num_windows]
const ProjectiveType
*__restrict__ all_final_buckets, // [num_windows * NUM_BUCKETS]
ProjectiveType *window_sums, // [num_windows]
const ProjectiveType *all_final_buckets, // [num_windows * NUM_BUCKETS]
uint32_t num_windows, uint32_t num_buckets) {
using ProjectivePoint = Projective<ProjectiveType>;
using ProjectivePoint = ProjectiveSelector<ProjectiveType>;
const uint32_t window_idx = blockIdx.x;
if (window_idx >= num_windows)
@@ -434,22 +429,18 @@ __global__ void kernel_compute_window_sums(
}
// ============================================================================
// CPU Horner Combination
// CPU-side Horner Combination (faster than single-thread GPU)
// ============================================================================
// Combines window sums using Horner's method on the CPU. A single CPU core
// native 64-bit multiply is much faster than a single GPU thread for this
// workload. The CPU path takes ~0.1 ms; a <<<1,1>>> GPU kernel takes ~10-12 ms.
//
// Horner evaluation (MSB-first):
// acc = window_sums[0]
// for w = 1 .. num_windows-1:
// acc = acc * 2^window_size + window_sums[w]
// CPU Horner: combine window sums using Horner's method on host
// Single-threaded CPU execution is faster than single-threaded GPU for this
// sequential operation. The memcpy overhead is smaller than the GPU's memory
// latency penalty for sequential access patterns.
template <typename ProjectiveType>
void horner_combine_cpu(ProjectiveType &result,
const ProjectiveType *window_sums, uint32_t num_windows,
uint32_t window_size) {
using ProjectivePoint = Projective<ProjectiveType>;
using ProjectivePoint = ProjectiveSelector<ProjectiveType>;
ProjectiveType acc;
ProjectivePoint::point_at_infinity(acc);
@@ -472,7 +463,7 @@ void horner_combine_cpu(ProjectiveType &result,
ProjectivePoint::point_copy(acc, temp);
}
} else if (!ProjectivePoint::is_infinity(acc)) {
// Window sum is infinity but accumulator is not -- still shift left
// Window sum is zero, but still need to shift
for (uint32_t i = 0; i < window_size; i++) {
ProjectivePoint::projective_double(temp, acc);
ProjectivePoint::point_copy(acc, temp);
@@ -488,27 +479,25 @@ void horner_combine_cpu(ProjectiveType &result,
// ============================================================================
// Template MSM with BigInt scalars - ALL WINDOWS PARALLEL
// Result is written directly to a host pointer -- no device round-trip needed.
// d_scratch: caller-provided device buffer for intermediate bucket arrays and
// window sums. The caller is responsible for allocating and freeing this
// buffer.
template <typename AffineType, typename ProjectiveType>
void point_msm_pippenger_impl_async(
cudaStream_t stream, uint32_t gpu_index, ProjectiveType *h_result,
const AffineType *d_points, const Scalar *d_scalars, uint32_t n,
uint32_t threads_per_block, uint32_t window_size, uint32_t bucket_count,
ProjectiveType *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated) {
using ProjectivePoint = Projective<ProjectiveType>;
void point_msm_async_pippenger_impl(
cudaStream_t stream, uint32_t gpu_index, ProjectiveType *d_result,
const AffineType *d_points, const Scalar *d_scalars,
ProjectiveType *d_scratch, uint32_t n, uint32_t threads_per_block,
uint32_t window_size, uint32_t bucket_count, uint64_t &size_tracker) {
using ProjectivePoint = ProjectiveSelector<ProjectiveType>;
if (n == 0) {
ProjectivePoint::point_at_infinity(*h_result);
cuda_set_device(gpu_index);
kernel_clear_buckets<ProjectiveType><<<1, 1, 0, stream>>>(d_result, 1);
check_cuda_error(cudaGetLastError());
return;
}
PANIC_IF_FALSE(h_result != nullptr && d_points != nullptr &&
PANIC_IF_FALSE(n > 0, "point_msm_async_pippenger_impl: invalid size n=%u", n);
PANIC_IF_FALSE(d_result != nullptr && d_points != nullptr &&
d_scalars != nullptr && d_scratch != nullptr,
"point_msm_pippenger_impl_async: null pointer argument");
"point_msm_async_pippenger_impl: null pointer argument");
cuda_set_device(gpu_index);
@@ -535,20 +524,34 @@ void point_msm_pippenger_impl_async(
const size_t total_scratch =
all_block_buckets_size + all_final_buckets_size + num_windows;
// Partition the caller-provided scratch buffer into sub-regions
ProjectiveType *d_all_block_buckets = d_scratch;
ProjectiveType *d_all_final_buckets = d_scratch + all_block_buckets_size;
// Check for overflow before allocating scratch space
size_t scratch_bytes = 0;
bool scratch_overflow = __builtin_mul_overflow(
total_scratch, sizeof(ProjectiveType), &scratch_bytes);
PANIC_IF_FALSE(!scratch_overflow,
"point_msm_async_pippenger_impl: scratch allocation overflow "
"(total_scratch=%zu, element_size=%zu)",
total_scratch, sizeof(ProjectiveType));
// Allocate internal scratch space (user-provided scratch is too small for
// all-windows-parallel)
ProjectiveType *d_internal_scratch =
(ProjectiveType *)cuda_malloc_with_size_tracking_async(
scratch_bytes, stream, gpu_index, size_tracker, true);
ProjectiveType *d_all_block_buckets = d_internal_scratch;
ProjectiveType *d_all_final_buckets =
d_internal_scratch + all_block_buckets_size;
ProjectiveType *d_window_sums = d_all_final_buckets + all_final_buckets_size;
// Clear all scratch space
const uint32_t clear_blocks = CEIL_DIV(total_scratch, KERNEL_THREADS_MAX);
PANIC_IF_FALSE(clear_blocks * KERNEL_THREADS_MAX >= total_scratch,
"kernel_clear_buckets: insufficient threads (%zu) to clear "
"buffer (%zu elements)",
static_cast<size_t>(clear_blocks) * KERNEL_THREADS_MAX,
total_scratch);
"kernel_clear_buckets: insufficient threads (%u) to clear "
"buffer (%u elements)",
clear_blocks * KERNEL_THREADS_MAX, total_scratch);
kernel_clear_buckets<ProjectiveType>
<<<clear_blocks, KERNEL_THREADS_MAX, 0, stream>>>(d_scratch,
<<<clear_blocks, KERNEL_THREADS_MAX, 0, stream>>>(d_internal_scratch,
total_scratch);
check_cuda_error(cudaGetLastError());
@@ -557,10 +560,8 @@ void point_msm_pippenger_impl_async(
num_windows * launch_params.num_blocks_per_window;
PANIC_IF_FALSE(
total_accum_blocks * bucket_count <= all_block_buckets_size,
"kernel_accumulate_all_windows: max write index (%zu) exceeds buffer "
"(%zu)",
static_cast<size_t>(total_accum_blocks) * bucket_count,
all_block_buckets_size);
"kernel_accumulate_all_windows: max write index (%u) exceeds buffer (%u)",
total_accum_blocks * bucket_count, all_block_buckets_size);
kernel_accumulate_all_windows<AffineType, ProjectiveType>
<<<total_accum_blocks, launch_params.adjusted_threads_per_block,
launch_params.accum_shared_mem, stream>>>(
@@ -574,7 +575,7 @@ void point_msm_pippenger_impl_async(
launch_params.num_blocks_per_window, gpu_index);
PANIC_IF_FALSE(
total_reduce_blocks <= all_final_buckets_size,
"kernel_reduce_all_windows: blocks (%u) exceeds output buffer (%zu)",
"kernel_reduce_all_windows: blocks (%u) exceeds output buffer (%u)",
total_reduce_blocks, all_final_buckets_size);
kernel_reduce_all_windows<ProjectiveType>
<<<total_reduce_blocks, reduce_params.adjusted_threads,
@@ -587,31 +588,36 @@ void point_msm_pippenger_impl_async(
// Round up to next multiple of 32 (warp size) for efficient scheduling.
// The kernel already has `if (tid < n)` bounds checks for the excess threads.
const uint32_t combine_threads = ((bucket_count - 1) + 31) & ~31u;
const size_t combine_shared_mem =
safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(combine_threads));
const size_t combine_shared_mem = combine_threads * sizeof(ProjectiveType);
PANIC_IF_FALSE(num_windows * bucket_count <= all_final_buckets_size,
"kernel_compute_window_sums: max read index (%zu) exceeds "
"input buffer (%zu)",
static_cast<size_t>(num_windows) * bucket_count,
all_final_buckets_size);
"kernel_compute_window_sums: max read index (%u) exceeds "
"input buffer (%u)",
num_windows * bucket_count, all_final_buckets_size);
kernel_compute_window_sums<ProjectiveType>
<<<num_windows, combine_threads, combine_shared_mem, stream>>>(
d_window_sums, d_all_final_buckets, num_windows, bucket_count);
check_cuda_error(cudaGetLastError());
// Phase 4: CPU Horner combine, result written directly to host pointer
//
// The Horner loop is inherently sequential. A single CPU core is much faster
// than a single GPU thread for this workload, so we run Horner on the CPU
// and write the result directly to the caller's host pointer.
// Phase 4: CPU-side Horner combine (faster than single GPU thread!)
// Download window sums to host
std::vector<ProjectiveType> h_window_sums(num_windows);
cuda_memcpy_async_to_cpu(
h_window_sums.data(), d_window_sums,
safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(num_windows)), stream,
gpu_index);
cuda_memcpy_async_to_cpu(h_window_sums.data(), d_window_sums,
num_windows * sizeof(ProjectiveType), stream,
gpu_index);
cuda_synchronize_stream(stream, gpu_index);
horner_combine_cpu(*h_result, h_window_sums.data(), num_windows, window_size);
// Perform Horner combination on CPU
ProjectiveType h_result;
horner_combine_cpu(h_result, h_window_sums.data(), num_windows, window_size);
// Upload result back to device
cuda_memcpy_async_to_gpu(d_result, &h_result, sizeof(ProjectiveType), stream,
gpu_index);
// Cleanup - must sync before returning since h_result is a local variable
cuda_synchronize_stream(stream, gpu_index);
cuda_drop_with_size_tracking_async(d_internal_scratch, stream, gpu_index,
true);
}
// ============================================================================
@@ -646,92 +652,36 @@ inline void get_g2_window_params(uint32_t n, uint32_t &window_size,
bucket_count = MSM_G2_BUCKET_COUNT; // 32 buckets
}
// ============================================================================
// Scratch Size Computation
// ============================================================================
// Computes the exact scratch buffer size (in bytes) needed by
// point_msm_pippenger_impl_async for a given input count n. The formula must
// stay in sync with the scratch partitioning inside that function:
// all_block_buckets: num_windows * num_blocks_per_window * bucket_count
// all_final_buckets: num_windows * bucket_count
// window_sums: num_windows
// Factoring this into a helper avoids duplicating the formula in every caller
// and prevents the buffer-underallocation bug that occurs when callers use
// ad-hoc estimates.
template <typename AffineType, typename ProjectiveType>
size_t pippenger_scratch_size(uint32_t n, uint32_t gpu_index) {
if (n == 0)
return 0;
uint32_t window_size, bucket_count;
// Use the same window parameter selection as the MSM entry points
if constexpr (std::is_same_v<AffineType, G1Affine>) {
get_g1_window_params(n, window_size, bucket_count);
} else {
get_g2_window_params(n, window_size, bucket_count);
}
const uint32_t threads_per_block = msm_threads_per_block<AffineType>(n);
const uint32_t num_windows = CEIL_DIV(Scalar::NUM_BITS, window_size);
// Phase1KernelLaunchParams computes the adjusted threads per block
// respecting shared memory limits, which determines num_blocks_per_window
Phase1KernelLaunchParams<AffineType> launch_params(n, threads_per_block,
bucket_count, gpu_index);
const size_t all_block_buckets_elems = static_cast<size_t>(num_windows) *
launch_params.num_blocks_per_window *
bucket_count;
const size_t all_final_buckets_elems =
static_cast<size_t>(num_windows) * bucket_count;
const size_t total_elems =
all_block_buckets_elems + all_final_buckets_elems + num_windows;
return safe_mul_sizeof<ProjectiveType>(total_elems);
}
// Non-template wrappers so callers outside this TU (c_wrapper.cu, tests, etc.)
// can compute the correct scratch size without access to template internals.
// G1 entry point: forwards n and gpu_index to
// pippenger_scratch_size<G1Affine, G1Projective> and returns its result
// unchanged.
size_t pippenger_scratch_size_g1(uint32_t n, uint32_t gpu_index) {
return pippenger_scratch_size<G1Affine, G1Projective>(n, gpu_index);
}
// G2 entry point: forwards n and gpu_index to
// pippenger_scratch_size<G2Point, G2ProjectivePoint> and returns its result
// unchanged.
size_t pippenger_scratch_size_g2(uint32_t n, uint32_t gpu_index) {
return pippenger_scratch_size<G2Point, G2ProjectivePoint>(n, gpu_index);
}
// MSM with BigInt scalars for G1 (projective coordinates internally)
void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result,
void point_msm_async_g1_pippenger(cudaStream_t stream, uint32_t gpu_index,
G1Projective *d_result,
const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch,
uint64_t &size_tracker,
bool gpu_memory_allocated) {
const Scalar *d_scalars,
G1Projective *d_scratch, uint32_t n,
uint64_t &size_tracker) {
uint32_t window_size, bucket_count;
get_g1_window_params(n, window_size, bucket_count);
point_msm_pippenger_impl_async<G1Affine, G1Projective>(
stream, gpu_index, h_result, d_points, d_scalars, n,
msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch,
size_tracker, gpu_memory_allocated);
point_msm_async_pippenger_impl<G1Affine, G1Projective>(
stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n,
get_msm_threads_per_block<G1Affine>(n), window_size, bucket_count,
size_tracker);
}
// MSM with BigInt scalars for G2 (projective coordinates internally)
// Uses larger window size to reduce Horner doublings (G2 ops are 2x more
// expensive)
void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result,
void point_msm_async_g2_pippenger(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *d_result,
const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch,
uint64_t &size_tracker,
bool gpu_memory_allocated) {
const Scalar *d_scalars,
G2ProjectivePoint *d_scratch, uint32_t n,
uint64_t &size_tracker) {
uint32_t window_size, bucket_count;
get_g2_window_params(n, window_size, bucket_count);
point_msm_pippenger_impl_async<G2Point, G2ProjectivePoint>(
stream, gpu_index, h_result, d_points, d_scalars, n,
msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch,
size_tracker, gpu_memory_allocated);
point_msm_async_pippenger_impl<G2Point, G2ProjectivePoint>(
stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n,
get_msm_threads_per_block<G2Point>(n), window_size, bucket_count,
size_tracker);
}

View File

@@ -1,7 +1,6 @@
#include "bls12_446_params.h"
#include "device.h"
#include "fp.h"
#include "fp_ptx32.cuh"
#include <cstdio>
#include <cstdlib>
#include <cstring>
@@ -188,9 +187,6 @@ __host__ __device__ void fp_copy(Fp &dst, const Fp &src) {
// "Raw" means without modular reduction - performs a + b and returns carry.
// This is an internal helper used by fp_add() which handles reduction.
__host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
return fp_add_raw_ptx32(c, a, b);
#else
UNSIGNED_LIMB carry = 0;
for (int i = 0; i < FP_LIMBS; i++) {
@@ -203,16 +199,12 @@ __host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
}
return carry;
#endif
}
// Subtraction with borrow propagation
// "Raw" means without modular reduction - performs a - b and returns borrow.
// This is an internal helper used by fp_sub() which handles reduction.
__host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
return fp_sub_raw_ptx32(c, a, b);
#else
UNSIGNED_LIMB borrow = 0;
for (int i = 0; i < FP_LIMBS; i++) {
@@ -226,15 +218,11 @@ __host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
}
return borrow;
#endif
}
// Addition with modular reduction: c = (a + b) mod p
// MONTGOMERY: Both inputs and output must be in Montgomery form
__host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
fp_add_ptx32(c, a, b);
#else
Fp sum;
UNSIGNED_LIMB carry = fp_add_raw(sum, a, b);
@@ -247,15 +235,11 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
} else {
fp_copy(c, sum);
}
#endif
}
// Subtraction with modular reduction: c = (a - b) mod p
// MONTGOMERY: Both inputs and output must be in Montgomery form
__host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
fp_sub_ptx32(c, a, b);
#else
Fp diff;
UNSIGNED_LIMB borrow = fp_sub_raw(diff, a, b);
@@ -266,31 +250,6 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
} else {
fp_copy(c, diff);
}
#endif
}
// Small-constant multiplication via addition chains.
// These replace full Montgomery multiplications by 2, 3, 4, 8 with a few
// modular additions, each ~25 instructions vs ~200+ for CIOS Montgomery mul.
// Doubling: c = a + a, computed with a single call to fp_add.
__host__ __device__ void fp_double(Fp &c, const Fp &a) { fp_add(c, a, a); }
// Multiply by 3 with two modular additions: c = (a + a) + a.
__host__ __device__ void fp_mul3(Fp &c, const Fp &a) {
  Fp doubled;
  fp_add(doubled, a, a); // doubled = 2a
  fp_add(c, doubled, a); // c = 2a + a
}
// Multiply by 4 with two modular additions: c = (a + a) + (a + a).
__host__ __device__ void fp_mul4(Fp &c, const Fp &a) {
  Fp doubled;
  fp_add(doubled, a, a);       // doubled = 2a
  fp_add(c, doubled, doubled); // c = 2a + 2a
}
// Multiply by 8: build 4a with fp_mul4, then add it to itself.
__host__ __device__ void fp_mul8(Fp &c, const Fp &a) {
  Fp quadrupled;
  fp_mul4(quadrupled, a);            // quadrupled = 4a
  fp_add(c, quadrupled, quadrupled); // c = 4a + 4a
}
// Helper function for limb multiplication: LIMB_BITS x LIMB_BITS -> 2*LIMB_BITS
@@ -475,9 +434,6 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
// Uses only FP_LIMBS+1 limbs of working space instead of 2*FP_LIMBS.
// Both a and b are in Montgomery form, result is in Montgomery form.
__host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
fp_mont_mul_cios_ptx32(c, a, b);
#else
const Fp &p = fp_modulus();
UNSIGNED_LIMB p_prime = fp_p_prime();
@@ -565,7 +521,6 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
fp_copy(c, reduced);
}
// Result is in Montgomery form
#endif
}
// Montgomery multiplication: c = (a * b * R_INV) mod p
@@ -885,24 +840,32 @@ __host__ __device__ Fp operator-(const Fp &a, const Fp &b) {
return c;
}
// TODO: This operator returns Montgomery form while operator+ and operator-
// preserve the input form. This inconsistency means expressions like
// `a + (b * c)` produce incorrect results. Verify all call sites and decide
// whether to convert the result back to normal form or remove this operator.
//
// Binary multiplication: a * b
// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
// form. This is consistent with operator+ and operator- which also require
// Montgomery-form inputs.
// EXTERNAL API: Accepts normal form inputs, converts to Montgomery, and returns
// Montgomery form result. For internal operations where inputs are already in
// Montgomery form, use fp_mont_mul() directly.
__host__ __device__ Fp operator*(const Fp &a, const Fp &b) {
Fp result;
fp_mont_mul(result, a, b);
Fp a_mont, b_mont, result;
// Convert from normal form to Montgomery form for computation
fp_to_montgomery(a_mont, a);
fp_to_montgomery(b_mont, b);
// Multiply in Montgomery form - result stays in Montgomery form
fp_mont_mul(result, a_mont, b_mont);
return result;
}
// Binary division: a / b
// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
// form. Computes a * b^{-1} entirely in Montgomery representation.
__host__ __device__ Fp operator/(const Fp &a, const Fp &b) {
Fp b_inv;
fp_mont_inv(b_inv, b);
Fp c;
fp_mont_mul(c, a, b_inv);
fp_div(c, a, b);
return c;
}
@@ -936,23 +899,16 @@ __host__ __device__ Fp &operator-=(Fp &a, const Fp &b) {
}
// Compound multiplication: a *= b
// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
// form.
__host__ __device__ Fp &operator*=(Fp &a, const Fp &b) {
Fp temp;
fp_mont_mul(temp, a, b);
Fp temp = a * b;
fp_copy(a, temp);
return a;
}
// Compound division: a /= b
// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
// form.
__host__ __device__ Fp &operator/=(Fp &a, const Fp &b) {
Fp b_inv;
fp_mont_inv(b_inv, b);
Fp temp;
fp_mont_mul(temp, a, b_inv);
fp_div(temp, a, b);
fp_copy(a, temp);
return a;
}

View File

@@ -74,30 +74,6 @@ __host__ __device__ void fp2_sub(Fp2 &c, const Fp2 &a, const Fp2 &b) {
fp_sub(c.c1, a.c1, b.c1);
}
// Small-constant multiplication via addition chains.
// These replace full Fp2 Montgomery multiplications by 2, 3, 4, 8 with
// modular additions on each component.
// Fp2 doubling: applies fp_double to the c0 and c1 components independently.
__host__ __device__ void fp2_double(Fp2 &c, const Fp2 &a) {
fp_double(c.c0, a.c0);
fp_double(c.c1, a.c1);
}
// Fp2 multiply-by-3: applies fp_mul3 to the c0 and c1 components
// independently.
__host__ __device__ void fp2_mul3(Fp2 &c, const Fp2 &a) {
fp_mul3(c.c0, a.c0);
fp_mul3(c.c1, a.c1);
}
// Fp2 multiply-by-4: applies fp_mul4 to the c0 and c1 components
// independently.
__host__ __device__ void fp2_mul4(Fp2 &c, const Fp2 &a) {
fp_mul4(c.c0, a.c0);
fp_mul4(c.c1, a.c1);
}
// Fp2 multiply-by-8: applies fp_mul8 to the c0 and c1 components
// independently.
__host__ __device__ void fp2_mul8(Fp2 &c, const Fp2 &a) {
fp_mul8(c.c0, a.c0);
fp_mul8(c.c1, a.c1);
}
// Multiplication: c = a * b
// (a0 + a1*i) * (b0 + b1*i) = (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i
// Optimized: converts to Montgomery once at start, operates, converts back at
@@ -166,40 +142,29 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b) {
fp_sub(c.c1, c.c1, t1);
}
// Montgomery squaring: c = a^2 (all in Montgomery form)
// Uses the complex-squaring identity for Fp2 = Fp[i]/(i^2+1):
// c0 = (a0 + a1)(a0 - a1) [since a0^2 - a1^2 = (a0+a1)(a0-a1)]
// c1 = 2 * a0 * a1
// This requires only 2 Fp multiplications vs 3 for general fp2_mont_mul.
// NOTE: All inputs and outputs are in Montgomery form
// Safe when c aliases a: all reads of a complete before any write to c.
__host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a) {
  // Every value derived from `a` is computed into a local first, so `c` may
  // refer to the same object as `a`.

  // Cross term a0 * a1; doubled at the end to form the c1 component.
  Fp cross;
  fp_mont_mul(cross, a.c0, a.c1);

  // (a0 + a1) * (a0 - a1) = a0^2 - a1^2, which is the c0 component.
  Fp plus, minus, real_part;
  fp_add(plus, a.c0, a.c1);
  fp_sub(minus, a.c0, a.c1);
  fp_mont_mul(real_part, plus, minus);

  // Outputs are written only now, in the same order as before: c1, then c0.
  fp_double(c.c1, cross);
  fp_copy(c.c0, real_part);
}
// Squaring with Montgomery conversion: c = a^2
// Converts to Montgomery form, uses the 2-mul complex-squaring formula,
// and converts back.
// Optimized: converts to Montgomery once at start, operates, converts back at
// end (4 conversions instead of 9)
__host__ __device__ void fp2_square(Fp2 &c, const Fp2 &a) {
// Convert inputs to Montgomery form once
Fp a0_m, a1_m;
fp_to_montgomery(a0_m, a.c0);
fp_to_montgomery(a1_m, a.c1);
// Use the 2-mul complex-squaring identity in Montgomery form
Fp2 a_m = {a0_m, a1_m};
Fp2 c_m;
fp2_mont_square(c_m, a_m);
// Operate in Montgomery form
Fp t0, t1, t2;
fp_mont_mul(t0, a0_m, a0_m); // t0 = a0^2
fp_mont_mul(t1, a1_m, a1_m); // t1 = a1^2
fp_add(t2, a0_m, a1_m); // t2 = a0 + a1
fp_mont_mul(t2, t2, t2); // t2 = (a0 + a1)^2
fp_from_montgomery(c.c0, c_m.c0);
fp_from_montgomery(c.c1, c_m.c1);
Fp c0_m, c1_m;
fp_sub(c0_m, t0, t1); // c0 = a0^2 - a1^2
fp_sub(c1_m, t2, t0); // c1 = (a0+a1)^2 - a0^2
fp_sub(c1_m, c1_m, t1); // c1 = (a0+a1)^2 - a0^2 - a1^2 = 2*a0*a1
// Convert outputs back from Montgomery form
fp_from_montgomery(c.c0, c0_m);
fp_from_montgomery(c.c1, c1_m);
}
__host__ __device__ void fp2_neg(Fp2 &c, const Fp2 &a) {

View File

@@ -8,8 +8,6 @@
#include <cuda_runtime.h>
#include <random>
#include "checked_arithmetic.h"
// Helper to get modulus (use fp_modulus() from the library)
static Fp get_modulus() { return fp_modulus(); }
@@ -94,13 +92,27 @@ static void BM_G1_MSM(benchmark::State &state) {
const auto n = static_cast<int>(state.range(0));
std::mt19937_64 rng(42);
// Calculate required scratch space
const int threadsPerBlock =
get_msm_threads_per_block<G1Affine>(n); // Must match MSM implementation
const auto num_blocks = CEIL_DIV(n, threadsPerBlock);
const auto scratch_size =
(num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
// Allocate device memory
auto *d_points = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<G1Affine>(static_cast<size_t>(n)), g_benchmark_stream,
g_gpu_index, size_tracker, true));
n * sizeof(G1Affine), g_benchmark_stream, g_gpu_index, size_tracker,
true));
auto *d_scalars = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Scalar>(static_cast<size_t>(n)), g_benchmark_stream,
g_gpu_index, size_tracker, true));
n * sizeof(Scalar), g_benchmark_stream, g_gpu_index, size_tracker, true));
auto *d_result =
static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
sizeof(G1Projective), g_benchmark_stream, g_gpu_index, size_tracker,
true));
auto *d_scratch =
static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
scratch_size, g_benchmark_stream, g_gpu_index, size_tracker, true));
// Prepare host data
auto *h_points = new G1Affine[n];
auto *h_scalars = new Scalar[n];
@@ -113,11 +125,11 @@ static void BM_G1_MSM(benchmark::State &state) {
// Copy to device (once, before benchmark loop)
cuda_memcpy_with_size_tracking_async_to_gpu(
d_points, h_points, safe_mul_sizeof<G1Affine>(static_cast<size_t>(n)),
g_benchmark_stream, g_gpu_index, true);
d_points, h_points, n * sizeof(G1Affine), g_benchmark_stream, g_gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_scalars, h_scalars, safe_mul_sizeof<Scalar>(static_cast<size_t>(n)),
g_benchmark_stream, g_gpu_index, true);
d_scalars, h_scalars, n * sizeof(Scalar), g_benchmark_stream, g_gpu_index,
true);
// Convert points to Montgomery form (required for performance - all
// operations use Montgomery)
@@ -125,29 +137,26 @@ static void BM_G1_MSM(benchmark::State &state) {
n);
check_cuda_error(cudaGetLastError());
// Allocate scratch buffer sized to match the pippenger internal partitioning
size_t g1_scratch_bytes = pippenger_scratch_size_g1(n, g_gpu_index);
auto *d_scratch = static_cast<G1Projective *>(
cuda_malloc_with_size_tracking_async(g1_scratch_bytes, g_benchmark_stream,
g_gpu_index, size_tracker, true));
// Initialize result and scratch memory to zero (once, before benchmark loop)
cuda_memset_with_size_tracking_async(d_result, 0, sizeof(G1Projective),
g_benchmark_stream, g_gpu_index, true);
cuda_memset_with_size_tracking_async(d_scratch, 0, scratch_size,
g_benchmark_stream, g_gpu_index, true);
// Synchronize once before benchmark loop to ensure all setup is complete
cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);
// Result written directly to host -- no device allocation needed
G1Projective h_result;
// Warm-up iterations
for (int i = 0; i < WARMUP_ITERATIONS; i++) {
point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
d_scalars, n, d_scratch, size_tracker, true);
point_msm_async_g1(g_benchmark_stream, g_gpu_index, d_result, d_points,
d_scalars, d_scratch, n, size_tracker);
}
cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);
// Benchmark loop: only measure the MSM computation, no memory operations
for (auto _ : state) {
point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
d_scalars, n, d_scratch, size_tracker, true);
point_msm_async_g1(g_benchmark_stream, g_gpu_index, d_result, d_points,
d_scalars, d_scratch, n, size_tracker);
benchmark::ClobberMemory();
}
@@ -159,12 +168,14 @@ static void BM_G1_MSM(benchmark::State &state) {
delete[] h_points;
delete[] h_scalars;
cuda_drop_with_size_tracking_async(d_scratch, g_benchmark_stream, g_gpu_index,
true);
cuda_drop_with_size_tracking_async(d_points, g_benchmark_stream, g_gpu_index,
true);
cuda_drop_with_size_tracking_async(d_scalars, g_benchmark_stream, g_gpu_index,
true);
cuda_drop_with_size_tracking_async(d_result, g_benchmark_stream, g_gpu_index,
true);
cuda_drop_with_size_tracking_async(d_scratch, g_benchmark_stream, g_gpu_index,
true);
}
// Benchmark G2 MSM with random points and 320-bit scalars
@@ -175,13 +186,27 @@ static void BM_G2_MSM(benchmark::State &state) {
const auto n = static_cast<int>(state.range(0));
std::mt19937_64 rng(42);
// Calculate required scratch space
const int threadsPerBlock =
get_msm_threads_per_block<G2Affine>(n); // Must match MSM implementation
const auto num_blocks = CEIL_DIV(n, threadsPerBlock);
const auto scratch_size =
(num_blocks + 1) * MSM_G2_BUCKET_COUNT * sizeof(G2Projective);
// Allocate device memory
auto *d_points = static_cast<G2Affine *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<G2Affine>(static_cast<size_t>(n)), g_benchmark_stream,
g_gpu_index, size_tracker, true));
n * sizeof(G2Affine), g_benchmark_stream, g_gpu_index, size_tracker,
true));
auto *d_scalars = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Scalar>(static_cast<size_t>(n)), g_benchmark_stream,
g_gpu_index, size_tracker, true));
n * sizeof(Scalar), g_benchmark_stream, g_gpu_index, size_tracker, true));
auto *d_result =
static_cast<G2Projective *>(cuda_malloc_with_size_tracking_async(
sizeof(G2Projective), g_benchmark_stream, g_gpu_index, size_tracker,
true));
auto *d_scratch =
static_cast<G2Projective *>(cuda_malloc_with_size_tracking_async(
scratch_size, g_benchmark_stream, g_gpu_index, size_tracker, true));
// Prepare host data
auto *h_points = new G2Affine[n];
auto *h_scalars = new Scalar[n];
@@ -194,11 +219,11 @@ static void BM_G2_MSM(benchmark::State &state) {
// Copy to device (once, before benchmark loop)
cuda_memcpy_with_size_tracking_async_to_gpu(
d_points, h_points, safe_mul_sizeof<G2Affine>(static_cast<size_t>(n)),
g_benchmark_stream, g_gpu_index, true);
d_points, h_points, n * sizeof(G2Affine), g_benchmark_stream, g_gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_scalars, h_scalars, safe_mul_sizeof<Scalar>(static_cast<size_t>(n)),
g_benchmark_stream, g_gpu_index, true);
d_scalars, h_scalars, n * sizeof(Scalar), g_benchmark_stream, g_gpu_index,
true);
// Convert points to Montgomery form (required for performance - all
// operations use Montgomery)
@@ -206,29 +231,26 @@ static void BM_G2_MSM(benchmark::State &state) {
n);
check_cuda_error(cudaGetLastError());
// Allocate scratch buffer sized to match the pippenger internal partitioning
size_t g2_scratch_bytes = pippenger_scratch_size_g2(n, g_gpu_index);
auto *d_scratch = static_cast<G2Projective *>(
cuda_malloc_with_size_tracking_async(g2_scratch_bytes, g_benchmark_stream,
g_gpu_index, size_tracker, true));
// Initialize result and scratch memory to zero (once, before benchmark loop)
cuda_memset_with_size_tracking_async(d_result, 0, sizeof(G2Projective),
g_benchmark_stream, g_gpu_index, true);
cuda_memset_with_size_tracking_async(d_scratch, 0, scratch_size,
g_benchmark_stream, g_gpu_index, true);
// Synchronize once before benchmark loop to ensure all setup is complete
cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);
// Result written directly to host -- no device allocation needed
G2Projective h_result;
// Warm-up iterations
for (int i = 0; i < WARMUP_ITERATIONS; i++) {
point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
d_scalars, n, d_scratch, size_tracker, true);
point_msm_async_g2(g_benchmark_stream, g_gpu_index, d_result, d_points,
d_scalars, d_scratch, n, size_tracker);
}
cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);
// Benchmark loop: only measure the MSM computation, no memory operations
for (auto _ : state) {
point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
d_scalars, n, d_scratch, size_tracker, true);
point_msm_async_g2(g_benchmark_stream, g_gpu_index, d_result, d_points,
d_scalars, d_scratch, n, size_tracker);
benchmark::ClobberMemory();
}
@@ -240,12 +262,14 @@ static void BM_G2_MSM(benchmark::State &state) {
delete[] h_points;
delete[] h_scalars;
cuda_drop_with_size_tracking_async(d_scratch, g_benchmark_stream, g_gpu_index,
true);
cuda_drop_with_size_tracking_async(d_points, g_benchmark_stream, g_gpu_index,
true);
cuda_drop_with_size_tracking_async(d_scalars, g_benchmark_stream, g_gpu_index,
true);
cuda_drop_with_size_tracking_async(d_result, g_benchmark_stream, g_gpu_index,
true);
cuda_drop_with_size_tracking_async(d_scratch, g_benchmark_stream, g_gpu_index,
true);
}
// Register MSM benchmarks with sizes matching the Rust Criterion benchmarks

View File

@@ -77,6 +77,3 @@ gtest_discover_tests(test_fp)
gtest_discover_tests(test_fp2)
gtest_discover_tests(test_msm)
gtest_discover_tests(test_point_ops)
# Basic usage examples (standalone programs, not registered with CTest)
add_subdirectory(basic)

View File

@@ -1,34 +0,0 @@
# Basic usage examples for zk-cuda-backend. These are standalone programs for learning purposes, not part of the CTest
# suite. Run them directly after building to verify the API works end-to-end.
set(ZK_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../include)

# Each example is a single <name>.cu file built with identical settings:
#   basic_fp_ops:    host-side Fp field arithmetic
#   basic_curve_ops: host-side G1 elliptic curve operations
#   basic_msm:       GPU-accelerated multi-scalar multiplication
foreach(example_target basic_fp_ops basic_curve_ops basic_msm)
  add_executable(${example_target} ${example_target}.cu)
  target_link_libraries(${example_target} zk_cuda_backend tfhe_device)
  target_include_directories(${example_target} PRIVATE ${ZK_INCLUDE_DIR})
  set_target_properties(
    ${example_target}
    PROPERTIES CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}
               CUDA_SEPARABLE_COMPILATION ON
               CUDA_RESOLVE_DEVICE_SYMBOLS ON)
endforeach()

View File

@@ -1,90 +0,0 @@
// Basic elliptic curve operations on BLS12-446 G1.
//
// Demonstrates G1 projective point arithmetic on the host. Projective points
// support operator overloads (+, -, *, ==) that cover the common use cases.
// Affine points are used for input/output; coordinates are in Montgomery form
// during arithmetic and converted back by normalize_from_montgomery_g1().
//
// See README.md and include/curve.h for the full API reference.
//
// Build (from cuda/):
// cmake -B build -DZK_CUDA_BACKEND_BUILD_TESTS=ON
// cmake --build build --target basic_curve_ops
// ./build/tests_and_benchmarks/tests/basic/basic_curve_ops
#include "curve.h"
#include "fp.h"
#include <cassert>
#include <cstdio>
#include <cstring>
// Walks through host-side G1 operations step by step: negation, addition,
// compound assignment, scalar multiplication, projective -> affine conversion
// and conversion back to normal-form coordinates. Each step is checked with
// assert() and then reported with printf(). Returns 0 when the end is reached.
//
// NOTE: the checks rely on assert() from <cassert>, which expands to nothing
// when NDEBUG is defined. In such a build the "OK" lines are still printed but
// nothing has actually been verified.
int main() {
// ---- Generator point ----
// g1_generator() returns the hardcoded BLS12-446 G1 generator in normal
// (non-Montgomery) form. Convert to Montgomery, then lift to projective for
// host-side arithmetic.
const G1Affine &gen_normal = g1_generator();
assert(!g1_is_infinity(gen_normal));
// Work on a copy so the generator returned by reference is left untouched.
G1Affine gen_affine = gen_normal;
point_to_montgomery_inplace(gen_affine);
G1Projective G;
affine_to_projective(G, gen_affine);
// ---- Negation: -G ----
G1Projective neg_G = -G;
// G + (-G) = identity (Z = 0 in the projective convention)
G1Projective identity = G + neg_G;
assert(fp_is_zero(identity.Z));
printf("Negation (-G) and G + (-G) = identity: OK\n");
// ---- Addition: 2*G = G + G, 3*G = 2*G + G ----
// NOTE(review): the comparisons against G1Projective() below assume that a
// default-constructed G1Projective is the identity -- confirm in curve.h.
G1Projective two_G = G + G;
assert(!(two_G == G1Projective())); // not the identity
G1Projective three_G = two_G + G;
assert(!(three_G == G1Projective()));
printf("Addition (2*G, 3*G): OK\n");
// ---- Compound assignment: G += G ----
G1Projective acc = G;
acc += G; // acc = 2*G
assert(acc == two_G);
printf("Compound assignment (+=): OK\n");
// ---- Scalar multiplication: 3*G using Scalar type ----
// The * operator calls projective_scalar_mul internally.
// Zero every limb first, then set the lowest limb so the scalar equals 3.
Scalar scalar_3;
memset(&scalar_3, 0, sizeof(scalar_3));
scalar_3.limb[0] = 3;
G1Projective three_G_via_scalar = G * scalar_3;
assert(!(three_G_via_scalar == G1Projective()));
// Normalise both to Z = 1 (Montgomery) before comparing coordinates.
normalize_projective_g1(three_G);
normalize_projective_g1(three_G_via_scalar);
assert(three_G == three_G_via_scalar);
printf("Scalar multiplication (3*G == G + G + G): OK\n");
// ---- Projective -> affine conversion ----
// projective_to_affine_g1 keeps coordinates in Montgomery form.
G1Affine three_G_affine;
projective_to_affine_g1(three_G_affine, three_G);
assert(!g1_is_infinity(three_G_affine));
printf("Projective -> affine conversion: OK\n");
// ---- Convert to normal-form coordinates ----
// normalize_from_montgomery_g1 strips Montgomery form and sets Z = 1 in one
// pass.
// Operate on a copy so three_G_via_scalar keeps its Montgomery coordinates.
G1Projective result = three_G_via_scalar;
normalize_from_montgomery_g1(
result); // coordinates now in normal (non-Montgomery) form
assert(!fp_is_zero(result.Z)); // Z = 1 (non-zero)
printf("Conversion to normal-form projective: OK\n");
printf("All G1 curve operations passed.\n");
return 0;
}

View File

@@ -1,107 +0,0 @@
// Basic finite field (Fp) arithmetic over BLS12-446.
//
// Demonstrates host-side Fp operations intended as a learning reference.
// All arithmetic in the field is modular with respect to the BLS12-446 prime.
//
// Internal representation uses Montgomery form for multiplications.
// See README.md and include/fp.h for the full API reference.
//
// Build (from cuda/):
// cmake -B build -DZK_CUDA_BACKEND_BUILD_TESTS=ON
// cmake --build build --target basic_fp_ops
// ./build/tests_and_benchmarks/tests/basic/basic_fp_ops
#include "fp.h"
#include <cassert>
#include <cstdio>
int main() {
// ---- Addition and subtraction ----
// fp_one() and fp_zero() produce values in normal (non-Montgomery) form.
// fp_add / fp_sub perform modular addition/subtraction and are form-agnostic
// (addition is linear, so the result stays in the same form).
Fp a, b, c;
fp_one(a); // a = 1
fp_one(b); // b = 1
c = a + b; // c = 2
assert(c.limb[0] == 2);
c = c - a; // c = 1
assert(fp_is_one(c));
// Compound assignment
c += a; // c = 2
assert(c.limb[0] == 2);
c -= b; // c = 1
assert(fp_is_one(c));
printf("Addition/subtraction: OK\n");
// ---- Negation ----
// fp_neg computes p - a (mod p). For consistency use values in Montgomery
// form, but for add/sub/neg small normal-form values also work correctly.
Fp neg_a = -a; // neg_a = -1 mod p
Fp sum = a + neg_a;
assert(fp_is_zero(sum)); // 1 + (-1) = 0
printf("Negation: OK\n");
// ---- Multiplication (Montgomery form required) ----
// The * operator calls fp_mont_mul, which requires both operands to be in
// Montgomery form. Use fp_to_montgomery() to convert, or the helper
// fp_one_montgomery() / fp_two_montgomery() for small constants.
Fp one_m, two_m, result_m, result;
fp_one_montgomery(one_m); // one_m = 1 in Montgomery form
fp_two_montgomery(two_m); // two_m = 2 in Montgomery form
result_m = one_m * two_m; // result_m = 2 in Montgomery form
fp_from_montgomery(result, result_m);
assert(result.limb[0] == 2);
result_m = two_m * two_m; // result_m = 4 in Montgomery form
fp_from_montgomery(result, result_m);
assert(result.limb[0] == 4);
// Compound multiplication
result_m = two_m;
result_m *= two_m; // result_m = 4
fp_from_montgomery(result, result_m);
assert(result.limb[0] == 4);
// Convert an arbitrary normal-form value to Montgomery before multiplying
Fp five_normal, five_m, twenty_five_m, twenty_five;
fp_zero(five_normal);
five_normal.limb[0] = 5;
fp_to_montgomery(five_m, five_normal);
fp_mont_mul(twenty_five_m, five_m, five_m); // 5 * 5 = 25
fp_from_montgomery(twenty_five, twenty_five_m);
assert(twenty_five.limb[0] == 25);
printf("Multiplication: OK\n");
// ---- Inversion and division (normal-form convenience API) ----
// fp_inv and fp_div accept and return values in normal form (they handle
// the Montgomery conversion internally).
Fp five_inv;
fp_inv(five_inv, five_normal); // five_inv = 5^{-1} mod p
Fp one_check;
fp_div(one_check, five_normal, five_normal); // 5 / 5 = 1
assert(fp_is_one(one_check));
// Verify: 5 * 5^{-1} == 1 (using fp_div as a cross-check)
Fp product;
fp_zero(product);
product.limb[0] = 1; // product = 1
Fp two_normal;
fp_zero(two_normal);
two_normal.limb[0] = 2;
fp_div(product, two_normal, two_normal); // 2 / 2 = 1
assert(fp_is_one(product));
printf("Inversion/division: OK\n");
printf("All Fp operations passed.\n");
return 0;
}

View File

@@ -1,109 +0,0 @@
// Basic Multi-Scalar Multiplication (MSM) on BLS12-446 G1.
//
// Demonstrates the unmanaged GPU MSM API:
// - Allocating device memory for points, scalars, result, and scratch space
// - Copying data to the GPU and running point_msm_g1()
// - Reading the result back and verifying against a naive scalar-mul sum
//
// The unmanaged API requires the caller to manage all allocations. For a
// higher-level interface that handles memory internally, see the Rust bindings
// (G1Projective::msm in the Rust API).
//
// See README.md and include/msm.h for the full API reference.
//
// Build (from cuda/):
// cmake -B build -DZK_CUDA_BACKEND_BUILD_TESTS=ON
// cmake --build build --target basic_msm
// ./build/tests_and_benchmarks/tests/basic/basic_msm
#include "curve.h"
#include "device.h"
#include "fp.h"
#include "msm.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>
int main() {
if (!cuda_is_available()) {
printf("CUDA not available, skipping.\n");
return 0;
}
const uint32_t gpu_index = 0;
const uint32_t n = 4; // number of points / scalars
uint64_t size_tracker = 0;
// ---- Prepare host-side points in Montgomery form ----
// Use n doublings of the G1 generator: G, 2*G, 4*G, 8*G.
const G1Affine &gen_normal = g1_generator();
G1Affine gen = gen_normal;
point_to_montgomery_inplace(gen);
std::vector<G1Affine> h_points(n);
h_points[0] = gen;
for (uint32_t i = 1; i < n; i++) {
point_double(h_points[i], h_points[i - 1]);
}
// ---- Prepare host-side scalars ----
// Each scalar is a 320-bit little-endian integer (ZP_LIMBS × LIMB_BITS).
// Use scalar[i] = i + 1, so MSM = 1*G + 2*(2G) + 3*(4G) + 4*(8G).
std::vector<Scalar> h_scalars(n);
for (uint32_t i = 0; i < n; i++) {
memset(&h_scalars[i], 0, sizeof(Scalar));
h_scalars[i].limb[0] = i + 1;
}
// ---- Allocate device memory ----
cudaStream_t stream = cuda_create_stream(gpu_index);
auto *d_points =
static_cast<G1Affine *>(cuda_malloc(n * sizeof(G1Affine), gpu_index));
auto *d_scalars =
static_cast<Scalar *>(cuda_malloc(n * sizeof(Scalar), gpu_index));
// Use pippenger_scratch_size_g1() to compute the required scratch allocation.
size_t scratch_bytes = pippenger_scratch_size_g1(n, gpu_index);
auto *d_scratch =
static_cast<G1Projective *>(cuda_malloc(scratch_bytes, gpu_index));
// ---- Copy inputs to the GPU ----
cuda_memcpy_async_to_gpu(d_points, h_points.data(), n * sizeof(G1Affine),
stream, gpu_index);
cuda_memcpy_async_to_gpu(d_scalars, h_scalars.data(), n * sizeof(Scalar),
stream, gpu_index);
// ---- Run MSM (synchronous wrapper; result written directly to host) ----
G1Projective h_result;
point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch,
size_tracker, true);
// ---- Verify against naive sequential computation on the host ----
// Expected = sum over i of (scalar[i] * point[i]).
// Use projective * Scalar operator; host-side affine scalar_mul is internal
// only.
G1Projective expected;
g1_projective_point_at_infinity(expected);
for (uint32_t i = 0; i < n; i++) {
G1Projective term_proj;
affine_to_projective(term_proj, h_points[i]);
expected = expected + term_proj * h_scalars[i];
}
// Normalise to Z = 1 (Montgomery) before comparing projective coordinates.
normalize_projective_g1(h_result);
normalize_projective_g1(expected);
assert(h_result == expected);
printf("MSM result matches naive sequential computation.\n");
// ---- Cleanup ----
cuda_drop(d_points, gpu_index);
cuda_drop(d_scalars, gpu_index);
cuda_drop(d_scratch, gpu_index);
cuda_destroy_stream(stream, gpu_index);
printf("All MSM basic operations passed.\n");
return 0;
}

View File

@@ -6,8 +6,6 @@
#include "fp2.h"
#include <cuda_runtime.h>
#include "checked_arithmetic.h"
// ============================================================================
// CUDA Kernels for parallel Fp2 operations (test-only)
// ============================================================================
@@ -113,21 +111,16 @@ void fp2_add_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp2 *c,
const uint32_t blocksPerGrid = CEIL_DIV(n, threadsPerBlock);
auto *d_c = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
auto *d_a = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
auto *d_b = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
cuda_memcpy_with_size_tracking_async_to_gpu(
d_a, a, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_b, b, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_a, a, n * sizeof(Fp2), stream,
gpu_index, true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_b, b, n * sizeof(Fp2), stream,
gpu_index, true);
kernel_fp2_add_array<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_c, d_a,
d_b, n);
@@ -136,8 +129,7 @@ void fp2_add_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp2 *c,
cuda_synchronize_stream(stream, gpu_index);
cuda_memcpy_async_to_cpu(c, d_c, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)),
stream, gpu_index);
cuda_memcpy_async_to_cpu(c, d_c, n * sizeof(Fp2), stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
if (d_c != nullptr) {
@@ -167,21 +159,16 @@ void fp2_mul_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp2 *c,
const uint32_t blocksPerGrid = CEIL_DIV(n, threadsPerBlock);
auto *d_c = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
auto *d_a = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
auto *d_b = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
cuda_memcpy_with_size_tracking_async_to_gpu(
d_a, a, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_b, b, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_a, a, n * sizeof(Fp2), stream,
gpu_index, true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_b, b, n * sizeof(Fp2), stream,
gpu_index, true);
kernel_fp2_mul_array<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_c, d_a,
d_b, n);
@@ -190,8 +177,7 @@ void fp2_mul_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp2 *c,
cuda_synchronize_stream(stream, gpu_index);
cuda_memcpy_async_to_cpu(c, d_c, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)),
stream, gpu_index);
cuda_memcpy_async_to_cpu(c, d_c, n * sizeof(Fp2), stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
if (d_c != nullptr) {

View File

@@ -6,8 +6,6 @@
#include "fp.h"
#include <cuda_runtime.h>
#include "checked_arithmetic.h"
// ============================================================================
// CUDA Kernels for parallel Fp operations (test-only)
// ============================================================================
@@ -175,22 +173,17 @@ void fp_add_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp *c,
// Allocate device memory (asynchronous with stream)
auto *d_c = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp), stream, gpu_index, size_tracker, true));
auto *d_a = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp), stream, gpu_index, size_tracker, true));
auto *d_b = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp), stream, gpu_index, size_tracker, true));
// Copy to device (asynchronous with stream)
cuda_memcpy_with_size_tracking_async_to_gpu(
d_a, a, safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_b, b, safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_a, a, n * sizeof(Fp), stream,
gpu_index, true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_b, b, n * sizeof(Fp), stream,
gpu_index, true);
// Launch kernel (with stream)
kernel_fp_add_array<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_c, d_a,
@@ -203,8 +196,7 @@ void fp_add_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp *c,
cuda_synchronize_stream(stream, gpu_index);
// Copy back (synchronous after stream sync)
cuda_memcpy_async_to_cpu(c, d_c, safe_mul_sizeof<Fp>(static_cast<size_t>(n)),
stream, gpu_index);
cuda_memcpy_async_to_cpu(c, d_c, n * sizeof(Fp), stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
// Free device memory (asynchronous with stream)
@@ -240,22 +232,17 @@ void fp_mul_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp *c,
// Allocate device memory (asynchronous with stream)
auto *d_c = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp), stream, gpu_index, size_tracker, true));
auto *d_a = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp), stream, gpu_index, size_tracker, true));
auto *d_b = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
size_tracker, true));
n * sizeof(Fp), stream, gpu_index, size_tracker, true));
// Copy to device (asynchronous with stream)
cuda_memcpy_with_size_tracking_async_to_gpu(
d_a, a, safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_b, b, safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_a, a, n * sizeof(Fp), stream,
gpu_index, true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_b, b, n * sizeof(Fp), stream,
gpu_index, true);
// Launch kernel (with stream)
kernel_fp_mul_array<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_c, d_a,
@@ -268,8 +255,7 @@ void fp_mul_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp *c,
cuda_synchronize_stream(stream, gpu_index);
// Copy back (synchronous after stream sync)
cuda_memcpy_async_to_cpu(c, d_c, safe_mul_sizeof<Fp>(static_cast<size_t>(n)),
stream, gpu_index);
cuda_memcpy_async_to_cpu(c, d_c, n * sizeof(Fp), stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
// Free device memory (asynchronous with stream)

View File

@@ -355,11 +355,9 @@ TEST_F(FpArithmeticTest, Multiplication) {
fp_mul_gpu(stream, gpu_index, &result, &five, &three);
// Also test on CPU for comparison
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp five_m, three_m;
fp_to_montgomery(five_m, five);
fp_to_montgomery(three_m, three);
Fp result_cpu_mont = five_m * three_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp result_cpu_mont = five * three;
Fp result_cpu;
fp_from_montgomery(result_cpu, result_cpu_mont);
@@ -574,11 +572,9 @@ TEST_F(FpArithmeticTest, MultiplicationByZero) {
fp_mul_gpu(stream, gpu_index, &result, &a, &zero);
// Also test on CPU for comparison
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m, zero_m;
fp_to_montgomery(a_m, a);
fp_to_montgomery(zero_m, zero);
Fp result_cpu_mont = a_m * zero_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp result_cpu_mont = a * zero;
fp_from_montgomery(result_cpu, result_cpu_mont);
EXPECT_TRUE(fp_is_zero_gpu(stream, gpu_index, &result));
@@ -602,13 +598,10 @@ TEST_F(FpArithmeticTest, Inversion) {
fp_mul_gpu(stream, gpu_index, &result, &a, &a_inv);
// Also test on CPU for comparison
// fp_inv returns normal form, convert both operands to Montgomery for
// operator*
fp_inv(a_inv_cpu, a);
Fp a_m, a_inv_cpu_m;
fp_to_montgomery(a_m, a);
fp_to_montgomery(a_inv_cpu_m, a_inv_cpu);
Fp result_cpu_mont = a_m * a_inv_cpu_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp result_cpu_mont = a * a_inv_cpu;
Fp result_cpu;
fp_from_montgomery(result_cpu, result_cpu_mont);
@@ -657,13 +650,11 @@ TEST_F(FpArithmeticTest, Division) {
fp_div_gpu(stream, gpu_index, &quotient, &a, &b);
fp_mul_gpu(stream, gpu_index, &result, &quotient, &b);
// operator/ now expects Montgomery-form inputs and returns Montgomery form
Fp a_m, b_m;
fp_to_montgomery(a_m, a);
fp_to_montgomery(b_m, b);
Fp quotient_cpu_m = a_m / b_m;
// quotient_cpu_m * b_m should give a_m back
Fp result_cpu_mont = quotient_cpu_m * b_m;
// Also test on CPU for comparison
Fp quotient_cpu = a / b;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp result_cpu_mont = quotient_cpu * b;
Fp result_cpu;
fp_from_montgomery(result_cpu, result_cpu_mont);
@@ -688,13 +679,8 @@ TEST_F(FpArithmeticTest, DivisionByOne) {
// Test on GPU
fp_div_gpu(stream, gpu_index, &result, &a, &one);
// operator/ expects Montgomery-form inputs, returns Montgomery form
Fp a_m, one_m;
fp_to_montgomery(a_m, a);
fp_to_montgomery(one_m, one);
Fp result_cpu_m = a_m / one_m;
Fp result_cpu;
fp_from_montgomery(result_cpu, result_cpu_m);
// Also test on CPU for comparison
Fp result_cpu = a / one;
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &a), ComparisonType::Equal)
<< "a / 1 should equal a";
@@ -831,10 +817,9 @@ TEST_F(FpArithmeticTest, SquareRoot) {
fp_mul_gpu(stream, gpu_index, &square, &a, &a);
// Also test on CPU for comparison
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m;
fp_to_montgomery(a_m, a);
Fp square_cpu_mont = a_m * a_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp square_cpu_mont = a * a;
fp_from_montgomery(square_cpu, square_cpu_mont);
// Verify that square is a quadratic residue (on GPU)
@@ -853,11 +838,10 @@ TEST_F(FpArithmeticTest, SquareRoot) {
cuda_synchronize_stream(stream, gpu_index);
// Also test on CPU for comparison
// fp_sqrt returns normal form; convert to Montgomery for operator*
fp_sqrt(sqrt_result_cpu, square_cpu);
Fp sqrt_result_cpu_m;
fp_to_montgomery(sqrt_result_cpu_m, sqrt_result_cpu);
Fp verify_cpu_mont = sqrt_result_cpu_m * sqrt_result_cpu_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp verify_cpu_mont = sqrt_result_cpu * sqrt_result_cpu;
fp_from_montgomery(verify_cpu, verify_cpu_mont);
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &verify, &square),
@@ -946,10 +930,9 @@ TEST_F(FpArithmeticTest, IsQuadraticResidue) {
fp_mul_gpu(stream, gpu_index, &square, &a, &a);
// Also test on CPU for comparison
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m;
fp_to_montgomery(a_m, a);
Fp square_cpu_mont = a_m * a_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp square_cpu_mont = a * a;
fp_from_montgomery(square_cpu, square_cpu_mont);
EXPECT_TRUE(fp_is_quadratic_residue_gpu(stream, gpu_index, &square))
@@ -1164,11 +1147,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication1) {
fp_mul_gpu(stream, gpu_index, &verify, &result, &one);
// Also test on CPU for comparison
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m, b_m;
fp_to_montgomery(a_m, a);
fp_to_montgomery(b_m, b);
Fp result_cpu_mont = a_m * b_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp result_cpu_mont = a * b;
fp_from_montgomery(result_cpu, result_cpu_mont);
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &verify),
@@ -1209,11 +1190,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {
fp_mul_gpu(stream, gpu_index, &result, &a, &b);
// Also test on CPU for comparison
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m, b_m;
fp_to_montgomery(a_m, a);
fp_to_montgomery(b_m, b);
Fp result_cpu_mont = a_m * b_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp result_cpu_mont = a * b;
fp_from_montgomery(result_cpu, result_cpu_mont);
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &expected),
@@ -1248,11 +1227,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
fp_add_gpu(stream, gpu_index, &expected, &a, &a);
// Also test on CPU for comparison
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m, b_m;
fp_to_montgomery(a_m, a);
fp_to_montgomery(b_m, b);
Fp result_cpu_mont = a_m * b_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp result_cpu_mont = a * b;
fp_from_montgomery(result_cpu, result_cpu_mont);
expected_cpu = a + a;
@@ -1282,10 +1259,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication4Square) {
fp_mul_gpu(stream, gpu_index, &verify, &result, &one);
// Also test on CPU for comparison
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m;
fp_to_montgomery(a_m, a);
Fp result_cpu_mont = a_m * a_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp result_cpu_mont = a * a;
fp_from_montgomery(result_cpu, result_cpu_mont);
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &verify),
@@ -1354,11 +1330,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {
fp_mul_gpu(stream, gpu_index, &verify, &result, &one);
// Also test on CPU for comparison
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m, b_m;
fp_to_montgomery(a_m, a);
fp_to_montgomery(b_m, b);
Fp result_cpu_mont = a_m * b_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp result_cpu_mont = a * b;
fp_from_montgomery(result_cpu, result_cpu_mont);
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &verify),
@@ -2138,11 +2112,9 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {
h_a[i] = test_utils::random_fp(rng);
h_b[i] = test_utils::random_fp(rng);
// Compute expected result on host
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m, b_m;
fp_to_montgomery(a_m, h_a[i]);
fp_to_montgomery(b_m, h_b[i]);
Fp expected_mont = a_m * b_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp expected_mont = h_a[i] * h_b[i];
fp_from_montgomery(h_expected[i], expected_mont);
}
@@ -2243,11 +2215,9 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {
h_b[i] = test_utils::random_fp(rng);
}
// Compute expected result on host
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m, b_m;
fp_to_montgomery(a_m, h_a[i]);
fp_to_montgomery(b_m, h_b[i]);
Fp expected_mont = a_m * b_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp expected_mont = h_a[i] * h_b[i];
fp_from_montgomery(h_expected[i], expected_mont);
}
@@ -2398,11 +2368,9 @@ TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {
for (int i = 0; i < n; i++) {
h_a[i] = test_utils::random_fp(rng);
h_b[i] = test_utils::random_fp(rng);
// operator* expects Montgomery-form inputs and returns Montgomery form
Fp a_m, b_m;
fp_to_montgomery(a_m, h_a[i]);
fp_to_montgomery(b_m, h_b[i]);
Fp expected_mont = a_m * b_m;
// operator* returns result in Montgomery form, convert to normal for
// comparison
Fp expected_mont = h_a[i] * h_b[i];
fp_from_montgomery(h_expected[i], expected_mont);
}

View File

@@ -0,0 +1,148 @@
#include "device.h"
#include "fp.h"
#include <cuda_runtime.h>
#include <gtest/gtest.h>
// Device kernel: invokes fp_one_montgomery from device code and stores the
// value in the Fp that `result` points to.
__global__ void kernel_fp_one_montgomery_device(Fp *result) {
  Fp &destination = *result;
  fp_one_montgomery(destination);
}
// Device kernel: builds the value 1 limb by limb, then calls fp_to_montgomery
// from device code and stores the converted value through `result`.
__global__ void kernel_fp_to_montgomery_device(Fp *result) {
  Fp unit;
  // Clear every limb first, then set the lowest limb so that unit == 1.
  for (int idx = 0; idx < FP_LIMBS; idx++) {
    unit.limb[idx] = 0;
  }
  unit.limb[0] = 1;

  fp_to_montgomery(*result, unit);
}
// Device kernel: stores a fixed seven-limb constant into `result` without
// calling any Fp helper.
__global__ void kernel_hardcoded_z(Fp *result) {
  // Limbs are listed from index 0 upwards, in the order they are stored.
  const unsigned long long z_limbs[7] = {
      0x3b8fff65553d5554ULL, 0xa446eb5cea3128cfULL, 0xf6c648f07714c846ULL,
      0xc22966d114e3a7f5ULL, 0xfda96d21d7f40737ULL, 0x7fc0f2da6954a6ffULL,
      0x0c847c135ce86b2bULL};

  for (int idx = 0; idx < 7; idx++) {
    result->limb[idx] = z_limbs[idx];
  }
}
// Runs fp_one_montgomery inside a single-thread kernel, copies the value back
// to the host, prints its limbs and checks that it is not zero.
TEST(FpDeviceCall, FpOneMontgomeryInKernel) {
  uint64_t size_tracker = 0;
  if (!cuda_is_available()) {
    GTEST_SKIP() << "CUDA not available";
  }

  uint32_t gpu_index = 0;
  auto stream = cuda_create_stream(gpu_index);

  auto *d_result = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
      sizeof(Fp), stream, gpu_index, size_tracker, true));

  // Call fp_one_montgomery INSIDE device kernel
  kernel_fp_one_montgomery_device<<<1, 1, 0, stream>>>(d_result);
  check_cuda_error(cudaGetLastError());
  cuda_synchronize_stream(stream, gpu_index);

  Fp h_result;
  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(Fp), stream, gpu_index);
  // Wait for the copy to finish before reading h_result on the host.
  cuda_synchronize_stream(stream, gpu_index);

  // Check if result is zero
  bool is_zero = fp_is_zero(h_result);

  std::cout << "fp_one_montgomery (called IN device kernel) result:"
            << std::endl;
  // Iterate over the limb count declared by the Fp header rather than a
  // hard-coded number, so the dump always matches the actual Fp layout.
  for (int i = 0; i < FP_LIMBS; i++) {
    std::cout << " limb[" << i << "] = 0x" << std::hex << h_result.limb[i]
              << std::dec << std::endl;
  }
  std::cout << "Is zero: " << (is_zero ? "YES - BUG!" : "no") << std::endl;

  EXPECT_FALSE(is_zero)
      << "fp_one_montgomery should NOT return zero when called from device!";

  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
  cuda_destroy_stream(stream, gpu_index);
}
// Runs fp_to_montgomery(1) inside a single-thread kernel, copies the value
// back to the host, prints its limbs and checks that it is not zero.
TEST(FpDeviceCall, FpToMontgomeryInKernel) {
  uint64_t size_tracker = 0;
  if (!cuda_is_available()) {
    GTEST_SKIP() << "CUDA not available";
  }

  uint32_t gpu_index = 0;
  auto stream = cuda_create_stream(gpu_index);

  auto *d_result = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
      sizeof(Fp), stream, gpu_index, size_tracker, true));

  // Call fp_to_montgomery INSIDE device kernel
  kernel_fp_to_montgomery_device<<<1, 1, 0, stream>>>(d_result);
  check_cuda_error(cudaGetLastError());
  cuda_synchronize_stream(stream, gpu_index);

  Fp h_result;
  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(Fp), stream, gpu_index);
  // Wait for the copy to finish before reading h_result on the host.
  cuda_synchronize_stream(stream, gpu_index);

  // Check if result is zero
  bool is_zero = fp_is_zero(h_result);

  std::cout << "fp_to_montgomery(1) (called IN device kernel) result:"
            << std::endl;
  // Iterate over the limb count declared by the Fp header rather than a
  // hard-coded number, so the dump always matches the actual Fp layout.
  for (int i = 0; i < FP_LIMBS; i++) {
    std::cout << " limb[" << i << "] = 0x" << std::hex << h_result.limb[i]
              << std::dec << std::endl;
  }
  std::cout << "Is zero: " << (is_zero ? "YES - BUG!" : "no") << std::endl;

  EXPECT_FALSE(is_zero)
      << "fp_to_montgomery(1) should NOT return zero when called from device!";

  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
  cuda_destroy_stream(stream, gpu_index);
}
// Writes a fixed constant into an Fp from a single-thread kernel, copies it
// back to the host, prints its limbs and checks that it is not zero.
TEST(FpDeviceCall, HardcodedZValue) {
  uint64_t size_tracker = 0;
  if (!cuda_is_available()) {
    GTEST_SKIP() << "CUDA not available";
  }

  uint32_t gpu_index = 0;
  auto stream = cuda_create_stream(gpu_index);

  auto *d_result = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
      sizeof(Fp), stream, gpu_index, size_tracker, true));

  // Set hardcoded Z value INSIDE device kernel
  kernel_hardcoded_z<<<1, 1, 0, stream>>>(d_result);
  check_cuda_error(cudaGetLastError());
  cuda_synchronize_stream(stream, gpu_index);

  Fp h_result;
  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(Fp), stream, gpu_index);
  // Wait for the copy to finish before reading h_result on the host.
  cuda_synchronize_stream(stream, gpu_index);

  // Check if result is zero
  bool is_zero = fp_is_zero(h_result);

  std::cout << "Hardcoded Z (set IN device kernel) result:" << std::endl;
  // Iterate over the limb count declared by the Fp header rather than a
  // hard-coded number, so the dump always matches the actual Fp layout.
  for (int i = 0; i < FP_LIMBS; i++) {
    std::cout << " limb[" << i << "] = 0x" << std::hex << h_result.limb[i]
              << std::dec << std::endl;
  }
  std::cout << "Is zero: " << (is_zero ? "YES - BUG!" : "no") << std::endl;

  EXPECT_FALSE(is_zero) << "Hardcoded Z value should NOT be zero!";

  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
  cuda_destroy_stream(stream, gpu_index);
}

View File

@@ -95,37 +95,40 @@ protected:
cudaStream_t stream;
// Helper to check if a point is on the curve y^2 = x^3 + b
// All arithmetic stays in Montgomery form (point coordinates are already
// Montgomery), converted to normal only for debug output.
bool is_on_curve(const G1Affine &point) {
if (point.infinity) {
return true; // Point at infinity is on the curve
}
// Coordinates are already in Montgomery form
const Fp &x_m = point.x;
const Fp &y_m = point.y;
// Convert from Montgomery form to normal form for verification
Fp x_normal, y_normal;
fp_from_montgomery(x_normal, point.x);
fp_from_montgomery(y_normal, point.y);
// Compute y^2 in Montgomery form
Fp y_squared_m = y_m * y_m;
// Compute y^2 (operator* returns Montgomery form, convert to normal)
Fp y_squared_mont = y_normal * y_normal;
Fp y_squared;
fp_from_montgomery(y_squared, y_squared_mont);
// Compute x^3 in Montgomery form
Fp x_squared_m = x_m * x_m;
Fp x_cubed_m = x_squared_m * x_m;
// Compute x^3 (operator* returns Montgomery form, convert to normal)
Fp x_squared_mont = x_normal * x_normal;
Fp x_squared;
fp_from_montgomery(x_squared, x_squared_mont);
Fp x_cubed_mont = x_squared * x_normal;
Fp x_cubed;
fp_from_montgomery(x_cubed, x_cubed_mont);
// Compute x^3 + b in Montgomery form (b = 1)
Fp b_m;
fp_one_montgomery(b_m);
Fp x_cubed_plus_b_m = x_cubed_m + b_m;
// Compute x^3 + b (b = 1)
Fp b;
fp_zero(b);
b.limb[0] = 1;
Fp x_cubed_plus_b = x_cubed + b;
// Check if y^2 == x^3 + b (comparison works directly in Montgomery form)
bool on_curve = y_squared_m == x_cubed_plus_b_m;
// Check if y^2 == x^3 + b
bool on_curve = y_squared == x_cubed_plus_b;
// Debug output if not on curve (convert to normal form for printing)
// Debug output if not on curve
if (!on_curve) {
Fp y_squared, x_cubed_plus_b;
fp_from_montgomery(y_squared, y_squared_m);
fp_from_montgomery(x_cubed_plus_b, x_cubed_plus_b_m);
std::cout << "WARNING: Point is NOT on the curve!" << std::endl;
print_fp(" y^2", y_squared);
print_fp(" x^3 + b", x_cubed_plus_b);

View File

@@ -0,0 +1,145 @@
#include "curve.h"
#include "device.h"
#include <gtest/gtest.h>
#include <iostream>
// Test fixture for projective coordinate operations
class ProjectiveTest : public ::testing::Test {
protected:
void *stream;
int gpu_index;
void SetUp() override {
gpu_index = 0;
stream = cuda_stream_create(gpu_index);
}
void TearDown() override { cuda_stream_destroy(stream, gpu_index); }
};
// Test: Convert affine -> projective -> affine (round trip).
//
// Takes the G1 generator, moves it to Montgomery form, lifts it to projective
// coordinates, projects it back to affine and leaves Montgomery form again.
// The final point must equal the original generator. This test does not use
// the fixture's stream or allocate any device buffer.
TEST_F(ProjectiveTest, G1RoundTrip) {
  // Get generator in Montgomery form
  G1Affine G = g1_generator();
  G1Affine G_mont = G;
  point_to_montgomery_inplace(G_mont);

  // Convert to projective
  G1ProjectivePoint G_proj;
  affine_to_projective(G_proj, G_mont);

  // Convert back to affine
  G1Affine G_back;
  projective_to_affine_g1(G_back, G_proj);

  // Convert from Montgomery; the infinity flag is carried over unchanged
  G1Affine G_result;
  fp_from_montgomery(G_result.x, G_back.x);
  fp_from_montgomery(G_result.y, G_back.y);
  G_result.infinity = G_back.infinity;

  // Compare against the original (non-Montgomery) generator
  EXPECT_EQ(fp_cmp(G_result.x, G.x), FpComparison::Equal)
      << "X coordinate mismatch in round-trip";
  EXPECT_EQ(fp_cmp(G_result.y, G.y), FpComparison::Equal)
      << "Y coordinate mismatch in round-trip";
  EXPECT_EQ(G_result.infinity, G.infinity) << "Infinity flag mismatch";
}
// Test: Projective doubling vs affine doubling.
//
// Computes 2*G twice: once with point_add(G, G) on device buffers, once with
// projective_point_double followed by a conversion back to affine. Both
// results stay in Montgomery form and their coordinates must be equal.
TEST_F(ProjectiveTest, G1DoublingVsAffine) {
  uint64_t tracked_bytes = 0;

  // Generator in Montgomery form, explicitly flagged as a finite point
  G1Affine generator_mont = g1_generator();
  point_to_montgomery_inplace(generator_mont);
  generator_mont.infinity = false;

  // Reference value: 2*G through the existing affine point_add
  auto *d_generator =
      static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Affine), stream, gpu_index, tracked_bytes, true));
  auto *d_doubled =
      static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Affine), stream, gpu_index, tracked_bytes, true));
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_generator, &generator_mont, sizeof(G1Affine), stream, gpu_index, true);
  point_add<G1Affine>(stream, gpu_index, d_doubled, d_generator, d_generator);

  G1Affine result_affine;
  cuda_memcpy_async_to_cpu(&result_affine, d_doubled, sizeof(G1Affine), stream,
                           gpu_index);
  // The host reads result_affine below, so wait for the copy to finish
  cuda_synchronize_stream(stream, gpu_index);

  // Value under test: lift to projective, double, project back to affine
  G1ProjectivePoint generator_proj;
  affine_to_projective(generator_proj, generator_mont);
  G1ProjectivePoint doubled_proj;
  projective_point_double(doubled_proj, generator_proj);
  G1Affine result_proj;
  projective_to_affine_g1(result_proj, doubled_proj);

  // Compare (both are in Montgomery form)
  EXPECT_EQ(fp_cmp(result_proj.x, result_affine.x), FpComparison::Equal)
      << "X coordinate mismatch: projective doubling vs affine doubling";
  EXPECT_EQ(fp_cmp(result_proj.y, result_affine.y), FpComparison::Equal)
      << "Y coordinate mismatch: projective doubling vs affine doubling";

  // Release device buffers
  cuda_drop_with_size_tracking_async(d_generator, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_doubled, stream, gpu_index, true);
}
// Test: Projective addition vs affine addition.
//
// Computes 3*G = G + 2*G twice: once with the affine point_add on device
// buffers, once with projective_point_add followed by a conversion back to
// affine. Both results stay in Montgomery form and must have equal
// coordinates.
TEST_F(ProjectiveTest, G1AdditionVsAffine) {
  uint64_t tracked_bytes = 0;

  // Generator in Montgomery form, explicitly flagged as a finite point
  G1Affine generator_mont = g1_generator();
  point_to_montgomery_inplace(generator_mont);
  generator_mont.infinity = false;

  // 2*G in affine, computed as G + G
  auto *d_generator =
      static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Affine), stream, gpu_index, tracked_bytes, true));
  auto *d_doubled =
      static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Affine), stream, gpu_index, tracked_bytes, true));
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_generator, &generator_mont, sizeof(G1Affine), stream, gpu_index, true);
  point_add<G1Affine>(stream, gpu_index, d_doubled, d_generator, d_generator);

  G1Affine doubled_mont;
  cuda_memcpy_async_to_cpu(&doubled_mont, d_doubled, sizeof(G1Affine), stream,
                           gpu_index);
  // doubled_mont is consumed on the host further down
  cuda_synchronize_stream(stream, gpu_index);

  // Reference value: G + 2G = 3G in affine
  auto *d_tripled =
      static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Affine), stream, gpu_index, tracked_bytes, true));
  point_add<G1Affine>(stream, gpu_index, d_tripled, d_generator, d_doubled);

  G1Affine result_affine;
  cuda_memcpy_async_to_cpu(&result_affine, d_tripled, sizeof(G1Affine), stream,
                           gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

  // Value under test: G + 2G = 3G in projective coordinates
  G1ProjectivePoint generator_proj;
  G1ProjectivePoint doubled_proj;
  G1ProjectivePoint tripled_proj;
  affine_to_projective(generator_proj, generator_mont);
  affine_to_projective(doubled_proj, doubled_mont);
  projective_point_add(tripled_proj, generator_proj, doubled_proj);

  // Project the sum back to affine
  G1Affine result_proj;
  projective_to_affine_g1(result_proj, tripled_proj);

  // Compare (both are in Montgomery form)
  EXPECT_EQ(fp_cmp(result_proj.x, result_affine.x), FpComparison::Equal)
      << "X coordinate mismatch: projective addition vs affine addition";
  EXPECT_EQ(fp_cmp(result_proj.y, result_affine.y), FpComparison::Equal)
      << "Y coordinate mismatch: projective addition vs affine addition";

  // Release device buffers
  cuda_drop_with_size_tracking_async(d_generator, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_doubled, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_tripled, stream, gpu_index, true);
}

View File

@@ -0,0 +1,379 @@
#include "curve.h"
#include "device.h"
#include "fp.h"
#include "fp2.h"
#include <cstdint>
#include <cstring>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
// Test fixture for scalar multiplication tests.
//
// Creates one CUDA stream on GPU 0 before each test and destroys it
// afterwards. When CUDA is unavailable the test is skipped from SetUp.
class ScalarMulTest : public ::testing::Test {
protected:
  void SetUp() override {
    // Initialize CUDA
    if (!cuda_is_available()) {
      GTEST_SKIP() << "CUDA not available";
    }
    gpu_index = 0;
    stream = cuda_create_stream(gpu_index);
    // Device generators are now hardcoded at compile time, no initialization
    // needed
  }

  void TearDown() override {
    // GoogleTest still calls TearDown after a skipped SetUp. The members are
    // default-initialized below so this check never reads an indeterminate
    // handle in that case.
    if (stream != nullptr) {
      cuda_destroy_stream(stream, gpu_index);
      stream = nullptr;
    }
  }

  uint32_t gpu_index = 0;
  cudaStream_t stream = nullptr;
};
// Test scalar multiplication by using MSM with a single point
// This tests the building block projective_scalar_mul indirectly
// MSM with n=1 calls projective_scalar_mul internally
// Test G1 scalar multiplication: scalar = 1 (should return point itself).
//
// Runs point_msm_g1 on a single (generator, 1) pair, copies the projective
// result back to the host, converts it to affine and out of Montgomery form,
// and compares it with the original generator.
TEST_F(ScalarMulTest, G1ScalarMulOne) {
  uint64_t size_tracker = 0;

  // Get generator point
  const G1Affine &G = g1_generator();
  if (g1_is_infinity(G)) {
    GTEST_SKIP() << "G1 generator not set";
  }

  // Convert to Montgomery form
  G1Affine G_mont = G;
  point_to_montgomery_inplace(G_mont);
  G_mont.infinity = false;

  // Create scalar = 1. All limbs are cleared via sizeof so the limb count is
  // not hard-coded here.
  Scalar scalar_one;
  std::memset(scalar_one.limb, 0, sizeof(scalar_one.limb));
  scalar_one.limb[0] = 1;

  // Allocate device memory
  auto *d_point = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
  auto *d_scalar = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
      sizeof(Scalar), stream, gpu_index, size_tracker, true));
  auto *d_result =
      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Projective), stream, gpu_index, size_tracker, true));

  // Copy to device
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_point, &G_mont, sizeof(G1Affine), stream, gpu_index, true);
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_scalar, &scalar_one, sizeof(Scalar), stream, gpu_index, true);

  // Test scalar multiplication using MSM with single point (tests
  // projective_scalar_mul). Scratch holds (num_blocks + 1) sets of buckets.
  int num_blocks = 1;
  size_t scratch_size =
      (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
  auto *d_scratch =
      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
          scratch_size, stream, gpu_index, size_tracker, true));
  point_msm_g1(stream, gpu_index, d_result, d_point, d_scalar, d_scratch, 1,
               size_tracker);
  check_cuda_error(cudaGetLastError());
  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);

  // Copy result back; synchronize before the host reads h_result
  G1Projective h_result;
  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(G1Projective), stream,
                           gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

  // Convert projective to affine
  G1Affine result_affine;
  projective_to_affine_g1(result_affine, h_result);

  // Convert from Montgomery to normal form
  G1Affine result_normal;
  fp_from_montgomery(result_normal.x, result_affine.x);
  fp_from_montgomery(result_normal.y, result_affine.y);
  result_normal.infinity = result_affine.infinity;

  // Check: result should be the same as input (scalar = 1)
  EXPECT_FALSE(result_normal.infinity)
      << "Result should not be at infinity for scalar=1";
  EXPECT_EQ(fp_cmp(result_normal.x, G.x), FpComparison::Equal)
      << "x-coordinate should match input point";
  EXPECT_EQ(fp_cmp(result_normal.y, G.y), FpComparison::Equal)
      << "y-coordinate should match input point";

  // Cleanup
  cuda_drop_with_size_tracking_async(d_point, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_scalar, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
}
// Test G1 scalar multiplication: scalar = 0 (should return infinity).
//
// Runs point_msm_g1 on a single (generator, 0) pair and checks that the
// projective result copied back to the host has Z == 0.
TEST_F(ScalarMulTest, G1ScalarMulZero) {
  uint64_t size_tracker = 0;

  // Get generator point
  const G1Affine &G = g1_generator();
  if (g1_is_infinity(G)) {
    GTEST_SKIP() << "G1 generator not set";
  }

  // Convert to Montgomery form
  G1Affine G_mont = G;
  point_to_montgomery_inplace(G_mont);

  // Create scalar = 0
  Scalar scalar_zero;
  std::memset(scalar_zero.limb, 0, sizeof(scalar_zero.limb));

  // Allocate device memory
  auto *d_point = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
  auto *d_scalar = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
      sizeof(Scalar), stream, gpu_index, size_tracker, true));
  auto *d_result =
      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Projective), stream, gpu_index, size_tracker, true));

  // Copy to device
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_point, &G_mont, sizeof(G1Affine), stream, gpu_index, true);
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_scalar, &scalar_zero, sizeof(Scalar), stream, gpu_index, true);

  // Test scalar multiplication using MSM with single point (tests
  // projective_scalar_mul). Scratch holds (num_blocks + 1) sets of buckets.
  int num_blocks = 1;
  size_t scratch_size =
      (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
  auto *d_scratch =
      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
          scratch_size, stream, gpu_index, size_tracker, true));
  point_msm_g1(stream, gpu_index, d_result, d_point, d_scalar, d_scratch, 1,
               size_tracker);
  check_cuda_error(cudaGetLastError());
  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);

  // Copy result back; synchronize before the host reads h_result
  G1Projective h_result;
  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(G1Projective), stream,
                           gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

  // Check: result should be at infinity (Z = 0)
  EXPECT_TRUE(fp_is_zero(h_result.Z))
      << "Result should be at infinity for scalar=0";

  // Cleanup
  cuda_drop_with_size_tracking_async(d_point, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_scalar, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
}
// Test G1 scalar multiplication: scalar = 2 (should return 2*point).
//
// Runs point_msm_g1 on a single (generator, 2) pair and compares the result
// with single_point_scalar_mul(generator, 2). Both values are taken out of
// Montgomery form before the comparison.
TEST_F(ScalarMulTest, G1ScalarMulTwo) {
  uint64_t size_tracker = 0;

  // Get generator point
  const G1Affine &G = g1_generator();
  if (g1_is_infinity(G)) {
    GTEST_SKIP() << "G1 generator not set";
  }

  // Convert to Montgomery form
  G1Affine G_mont = G;
  point_to_montgomery_inplace(G_mont);

  // Create scalar = 2. All limbs are cleared via sizeof so the limb count is
  // not hard-coded here.
  Scalar scalar_two;
  std::memset(scalar_two.limb, 0, sizeof(scalar_two.limb));
  scalar_two.limb[0] = 2;

  // Allocate device memory
  auto *d_point = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
  auto *d_scalar = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
      sizeof(Scalar), stream, gpu_index, size_tracker, true));
  auto *d_result =
      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Projective), stream, gpu_index, size_tracker, true));
  auto *d_expected =
      static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Affine), stream, gpu_index, size_tracker, true));

  // Copy to device
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_point, &G_mont, sizeof(G1Affine), stream, gpu_index, true);
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_scalar, &scalar_two, sizeof(Scalar), stream, gpu_index, true);

  // Test scalar multiplication using MSM with single point (tests
  // projective_scalar_mul). Scratch holds (num_blocks + 1) sets of buckets.
  int num_blocks = 1;
  size_t scratch_size =
      (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
  auto *d_scratch =
      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
          scratch_size, stream, gpu_index, size_tracker, true));
  point_msm_g1(stream, gpu_index, d_result, d_point, d_scalar, d_scratch, 1,
               size_tracker);
  check_cuda_error(cudaGetLastError());
  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);

  // Compute expected result: 2*G using the u64 scalar multiplication
  single_point_scalar_mul<G1Affine>(stream, gpu_index, d_expected, d_point, 2);

  // Synchronize and copy results back
  cuda_synchronize_stream(stream, gpu_index);
  G1Projective h_result;
  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(G1Projective), stream,
                           gpu_index);
  G1Affine h_expected;
  cuda_memcpy_async_to_cpu(&h_expected, d_expected, sizeof(G1Affine), stream,
                           gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

  // Convert projective result to affine
  G1Affine result_affine;
  projective_to_affine_g1(result_affine, h_result);

  // Convert from Montgomery to normal form
  G1Affine result_normal, expected_normal;
  fp_from_montgomery(result_normal.x, result_affine.x);
  fp_from_montgomery(result_normal.y, result_affine.y);
  result_normal.infinity = result_affine.infinity;
  fp_from_montgomery(expected_normal.x, h_expected.x);
  fp_from_montgomery(expected_normal.y, h_expected.y);
  expected_normal.infinity = h_expected.infinity;

  // Check: result should match expected (2*G). Coordinates are only compared
  // when both points are finite.
  EXPECT_EQ(result_normal.infinity, expected_normal.infinity)
      << "Infinity flag should match";
  if (!result_normal.infinity && !expected_normal.infinity) {
    EXPECT_EQ(fp_cmp(result_normal.x, expected_normal.x), FpComparison::Equal)
        << "x-coordinate should match 2*G";
    EXPECT_EQ(fp_cmp(result_normal.y, expected_normal.y), FpComparison::Equal)
        << "y-coordinate should match 2*G";
  }

  // Cleanup
  cuda_drop_with_size_tracking_async(d_point, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_scalar, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_expected, stream, gpu_index, true);
}
// Test G1 scalar multiplication: scalar = 3 (should return 3*point = point +
// 2*point).
//
// Runs point_msm_g1 on a single (generator, 3) pair and compares the result
// with single_point_scalar_mul(generator, 3). Both values are taken out of
// Montgomery form before the comparison.
TEST_F(ScalarMulTest, G1ScalarMulThree) {
  uint64_t size_tracker = 0;

  // Get generator point
  const G1Affine &G = g1_generator();
  if (g1_is_infinity(G)) {
    GTEST_SKIP() << "G1 generator not set";
  }

  // Convert to Montgomery form
  G1Affine G_mont = G;
  point_to_montgomery_inplace(G_mont);

  // Create scalar = 3. All limbs are cleared via sizeof so the limb count is
  // not hard-coded here.
  Scalar scalar_three;
  std::memset(scalar_three.limb, 0, sizeof(scalar_three.limb));
  scalar_three.limb[0] = 3;

  // Allocate device memory
  auto *d_point = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
  auto *d_scalar = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
      sizeof(Scalar), stream, gpu_index, size_tracker, true));
  auto *d_result =
      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Projective), stream, gpu_index, size_tracker, true));
  auto *d_expected =
      static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
          sizeof(G1Affine), stream, gpu_index, size_tracker, true));

  // Copy to device
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_point, &G_mont, sizeof(G1Affine), stream, gpu_index, true);
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_scalar, &scalar_three, sizeof(Scalar), stream, gpu_index, true);

  // Test scalar multiplication using MSM with single point (tests
  // projective_scalar_mul). Scratch holds (num_blocks + 1) sets of buckets.
  int num_blocks = 1;
  size_t scratch_size =
      (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
  auto *d_scratch =
      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
          scratch_size, stream, gpu_index, size_tracker, true));
  point_msm_g1(stream, gpu_index, d_result, d_point, d_scalar, d_scratch, 1,
               size_tracker);
  check_cuda_error(cudaGetLastError());
  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);

  // Compute expected result: 3*G using u64 scalar multiplication
  single_point_scalar_mul<G1Affine>(stream, gpu_index, d_expected, d_point, 3);

  // Synchronize and copy results back
  cuda_synchronize_stream(stream, gpu_index);
  G1Projective h_result;
  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(G1Projective), stream,
                           gpu_index);
  G1Affine h_expected;
  cuda_memcpy_async_to_cpu(&h_expected, d_expected, sizeof(G1Affine), stream,
                           gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

  // Convert projective result to affine
  G1Affine result_affine;
  projective_to_affine_g1(result_affine, h_result);

  // Convert from Montgomery to normal form
  G1Affine result_normal, expected_normal;
  fp_from_montgomery(result_normal.x, result_affine.x);
  fp_from_montgomery(result_normal.y, result_affine.y);
  result_normal.infinity = result_affine.infinity;
  fp_from_montgomery(expected_normal.x, h_expected.x);
  fp_from_montgomery(expected_normal.y, h_expected.y);
  expected_normal.infinity = h_expected.infinity;

  // Check: result should match expected (3*G). Coordinates are only compared
  // when both points are finite.
  EXPECT_EQ(result_normal.infinity, expected_normal.infinity)
      << "Infinity flag should match";
  if (!result_normal.infinity && !expected_normal.infinity) {
    EXPECT_EQ(fp_cmp(result_normal.x, expected_normal.x), FpComparison::Equal)
        << "x-coordinate should match 3*G";
    EXPECT_EQ(fp_cmp(result_normal.y, expected_normal.y), FpComparison::Equal)
        << "y-coordinate should match 3*G";
  }

  // Cleanup
  cuda_drop_with_size_tracking_async(d_point, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_scalar, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
  cuda_drop_with_size_tracking_async(d_expected, stream, gpu_index, true);
}

View File

@@ -139,37 +139,31 @@ unsafe extern "C" {
pub fn g2_is_infinity_wrapper(point: *const G2Point) -> bool;
}
unsafe extern "C" {
pub fn g1_msm_unmanaged_wrapper_async(
pub fn g1_msm_unmanaged_wrapper(
stream: cudaStream_t,
gpu_index: u32,
h_result: *mut G1ProjectivePoint,
d_result: *mut G1ProjectivePoint,
d_points: *const G1Point,
d_scalars: *const Scalar,
n: u32,
d_scratch: *mut G1ProjectivePoint,
gpu_memory_allocated: bool,
n: u32,
points_in_montgomery: bool,
size_tracker: *mut u64,
);
}
unsafe extern "C" {
pub fn g2_msm_unmanaged_wrapper_async(
pub fn g2_msm_unmanaged_wrapper(
stream: cudaStream_t,
gpu_index: u32,
h_result: *mut G2ProjectivePoint,
d_result: *mut G2ProjectivePoint,
d_points: *const G2Point,
d_scalars: *const Scalar,
n: u32,
d_scratch: *mut G2ProjectivePoint,
gpu_memory_allocated: bool,
n: u32,
points_in_montgomery: bool,
size_tracker: *mut u64,
);
}
unsafe extern "C" {
pub fn pippenger_scratch_size_g1_wrapper(n: u32, gpu_index: u32) -> usize;
}
unsafe extern "C" {
pub fn pippenger_scratch_size_g2_wrapper(n: u32, gpu_index: u32) -> usize;
}
unsafe extern "C" {
pub fn g1_msm_managed_wrapper(
stream: cudaStream_t,

View File

@@ -1,7 +1,6 @@
// C wrapper functions for Rust FFI
// These functions provide a C-compatible interface to the C++ functions
#include "checked_arithmetic.h"
#include "curve.h"
#include "device.h"
#include "msm.h"
@@ -63,77 +62,109 @@ bool g2_is_infinity_wrapper(const G2Affine* point) {
return g2_is_infinity(*point);
}
// Unmanaged MSM wrapper for G1 (points/scalars/scratch on device, result on host)
// Points MUST be in Montgomery form. Caller provides scratch buffer and
// controls allocation tracking via gpu_memory_allocated.
// Zero internal allocations — this is a thin validation + dispatch layer.
void g1_msm_unmanaged_wrapper_async(
// Unmanaged MSM wrapper for G1 (assumes all data is already on device)
// If points_in_montgomery is false, a temporary copy will be made and converted.
// For best performance, provide points already in Montgomery form to avoid allocation overhead.
// NOTE: This wrapper synchronizes the stream before returning — callers do not need to sync.
void g1_msm_unmanaged_wrapper(
cudaStream_t stream,
uint32_t gpu_index,
G1Projective* h_result,
G1Projective* d_result,
const G1Affine* d_points,
const Scalar* d_scalars,
uint32_t n,
G1Projective* d_scratch,
bool gpu_memory_allocated,
uint32_t n,
bool points_in_montgomery,
uint64_t* size_tracker
) {
PANIC_IF_FALSE(size_tracker != nullptr, "G1 MSM error: size_tracker is null");
uint64_t& size_tracker_ref = *size_tracker;
PANIC_IF_FALSE(n > 0, "G1 MSM error: n must be positive, got %u", n);
PANIC_IF_FALSE(stream != nullptr, "G1 MSM error: stream is null");
PANIC_IF_FALSE(h_result != nullptr, "G1 MSM error: h_result is null");
PANIC_IF_FALSE(d_result != nullptr, "G1 MSM error: d_result is null");
PANIC_IF_FALSE(d_points != nullptr, "G1 MSM error: d_points is null");
PANIC_IF_FALSE(d_scalars != nullptr, "G1 MSM error: d_scalars is null");
PANIC_IF_FALSE(d_scratch != nullptr, "G1 MSM error: d_scratch is null");
PANIC_IF_FALSE(gpu_index < static_cast<uint32_t>(cuda_get_number_of_gpus()),
PANIC_IF_FALSE(gpu_index < (uint32_t)cuda_get_number_of_gpus(),
"G1 MSM error: invalid gpu_index=%u (gpu_count=%d)", gpu_index,
cuda_get_number_of_gpus());
point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
d_scratch, size_tracker_ref, gpu_memory_allocated);
const G1Affine* points_to_use = d_points;
G1Affine* d_points_converted = nullptr;
if (!points_in_montgomery) {
size_t points_bytes = 0;
bool overflow = __builtin_mul_overflow((size_t)n, sizeof(G1Affine), &points_bytes);
PANIC_IF_FALSE(!overflow,
"G1 MSM unmanaged error: points byte size overflow (n=%u)", n);
d_points_converted = static_cast<G1Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
PANIC_IF_FALSE(d_points_converted != nullptr, "G1 MSM error: failed to allocate memory for Montgomery conversion");
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(d_points_converted, d_points, points_bytes, stream, gpu_index, true);
convert_g1_points_to_montgomery(stream, gpu_index, d_points_converted, n);
check_cuda_error(cudaGetLastError());
points_to_use = d_points_converted;
}
point_msm_async_g1(stream, gpu_index, d_result, points_to_use, d_scalars, d_scratch, n, size_tracker_ref);
check_cuda_error(cudaGetLastError());
if (d_points_converted != nullptr) {
cuda_drop_with_size_tracking_async(d_points_converted, stream, gpu_index, true);
}
cuda_synchronize_stream(stream, gpu_index);
}
// Unmanaged MSM wrapper for G2 (points/scalars/scratch on device, result on host)
// Points MUST be in Montgomery form. Caller provides scratch buffer and
// controls allocation tracking via gpu_memory_allocated.
// Zero internal allocations — this is a thin validation + dispatch layer.
void g2_msm_unmanaged_wrapper_async(
// Unmanaged MSM wrapper for G2 (assumes all data is already on device)
// If points_in_montgomery is false, a temporary copy will be made and converted.
// For best performance, provide points already in Montgomery form to avoid allocation overhead.
// NOTE: This wrapper synchronizes the stream before returning — callers do not need to sync.
void g2_msm_unmanaged_wrapper(
cudaStream_t stream,
uint32_t gpu_index,
G2Projective* h_result,
G2Projective* d_result,
const G2Affine* d_points,
const Scalar* d_scalars,
uint32_t n,
G2Projective* d_scratch,
bool gpu_memory_allocated,
uint32_t n,
bool points_in_montgomery,
uint64_t* size_tracker
) {
PANIC_IF_FALSE(size_tracker != nullptr, "G2 MSM error: size_tracker is null");
uint64_t& size_tracker_ref = *size_tracker;
PANIC_IF_FALSE(n > 0, "G2 MSM error: n must be positive, got %u", n);
PANIC_IF_FALSE(stream != nullptr, "G2 MSM error: stream is null");
PANIC_IF_FALSE(h_result != nullptr, "G2 MSM error: h_result is null");
PANIC_IF_FALSE(d_result != nullptr, "G2 MSM error: d_result is null");
PANIC_IF_FALSE(d_points != nullptr, "G2 MSM error: d_points is null");
PANIC_IF_FALSE(d_scalars != nullptr, "G2 MSM error: d_scalars is null");
PANIC_IF_FALSE(d_scratch != nullptr, "G2 MSM error: d_scratch is null");
PANIC_IF_FALSE(gpu_index < static_cast<uint32_t>(cuda_get_number_of_gpus()),
PANIC_IF_FALSE(gpu_index < (uint32_t)cuda_get_number_of_gpus(),
"G2 MSM error: invalid gpu_index=%u (gpu_count=%d)", gpu_index,
cuda_get_number_of_gpus());
point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
d_scratch, size_tracker_ref, gpu_memory_allocated);
const G2Affine* points_to_use = d_points;
G2Affine* d_points_converted = nullptr;
if (!points_in_montgomery) {
size_t points_bytes = 0;
bool overflow = __builtin_mul_overflow((size_t)n, sizeof(G2Affine), &points_bytes);
PANIC_IF_FALSE(!overflow,
"G2 MSM unmanaged error: points byte size overflow (n=%u)", n);
d_points_converted = static_cast<G2Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
PANIC_IF_FALSE(d_points_converted != nullptr, "G2 MSM error: failed to allocate memory for Montgomery conversion");
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(d_points_converted, d_points, points_bytes, stream, gpu_index, true);
convert_g2_points_to_montgomery(stream, gpu_index, d_points_converted, n);
check_cuda_error(cudaGetLastError());
points_to_use = d_points_converted;
}
point_msm_async_g2(stream, gpu_index, d_result, points_to_use, d_scalars, d_scratch, n, size_tracker_ref);
check_cuda_error(cudaGetLastError());
}
// Scratch size query wrappers (needed for bindgen `.*_wrapper` allowlist)
size_t pippenger_scratch_size_g1_wrapper(uint32_t n, uint32_t gpu_index) {
return pippenger_scratch_size_g1(n, gpu_index);
}
// Free temporary memory if allocated
if (d_points_converted != nullptr) {
cuda_drop_with_size_tracking_async(d_points_converted, stream, gpu_index, true);
}
size_t pippenger_scratch_size_g2_wrapper(uint32_t n, uint32_t gpu_index) {
return pippenger_scratch_size_g2(n, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
}
// Managed MSM wrapper for G1 (handles memory management internally)
@@ -154,48 +185,74 @@ void g1_msm_managed_wrapper(
PANIC_IF_FALSE(stream != nullptr, "G1 MSM error: stream is null");
PANIC_IF_FALSE(points != nullptr, "G1 MSM error: points is null");
PANIC_IF_FALSE(scalars != nullptr, "G1 MSM error: scalars is null");
PANIC_IF_FALSE(gpu_index < static_cast<uint32_t>(cuda_get_number_of_gpus()),
PANIC_IF_FALSE(gpu_index < (uint32_t)cuda_get_number_of_gpus(),
"G1 MSM error: invalid gpu_index=%u (gpu_count=%d)", gpu_index,
cuda_get_number_of_gpus());
cuda_set_device(gpu_index);
/////////////////////////////////
// TODO: Move this check closer to the kernels
const auto threadsPerBlock = get_msm_threads_per_block<G1Affine>(n);
const auto num_blocks = CEIL_DIV(n, threadsPerBlock);
/////////////////////////////////
// Compute buffer sizes with overflow checking.
size_t points_bytes = safe_mul_sizeof<G1Affine>(static_cast<size_t>(n));
size_t scalars_bytes = safe_mul_sizeof<Scalar>(static_cast<size_t>(n));
size_t scratch_elems = 0;
bool scratch_elems_overflow = __builtin_mul_overflow(
(size_t)(num_blocks + 1), (size_t)MSM_G1_BUCKET_COUNT, &scratch_elems);
PANIC_IF_FALSE(!scratch_elems_overflow,
"G1 MSM error: scratch element count overflow (num_blocks=%u)",
num_blocks);
size_t scratch_size = 0;
bool scratch_size_overflow =
__builtin_mul_overflow(scratch_elems, sizeof(G1Projective), &scratch_size);
PANIC_IF_FALSE(!scratch_size_overflow,
"G1 MSM error: scratch size overflow (scratch_elems=%zu)",
scratch_elems);
size_t points_bytes = 0;
bool points_bytes_overflow =
__builtin_mul_overflow((size_t)n, sizeof(G1Affine), &points_bytes);
PANIC_IF_FALSE(!points_bytes_overflow,
"G1 MSM error: points byte size overflow (n=%u)", n);
size_t scalars_bytes = 0;
bool scalars_bytes_overflow =
__builtin_mul_overflow((size_t)n, sizeof(Scalar), &scalars_bytes);
PANIC_IF_FALSE(!scalars_bytes_overflow,
"G1 MSM error: scalars byte size overflow (n=%u)", n);
// TODO: We should migrate to _unmanaged_ methods and have scratch/cleanup functions as tfhe-cuda-backend
auto* d_points = static_cast<G1Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_ref, true));
auto* d_result = static_cast<G1Projective*>(cuda_malloc_with_size_tracking_async(sizeof(G1Projective), stream, gpu_index, size_tracker_ref, true));
auto* d_scratch = static_cast<G1Projective*>(cuda_malloc_with_size_tracking_async(scratch_size, stream, gpu_index, size_tracker_ref, true));
PANIC_IF_FALSE(d_points && d_scalars && d_result && d_scratch,
"G1 MSM error: device memory allocation failed");
// Always copy points to GPU first
cuda_memcpy_with_size_tracking_async_to_gpu(d_points, points, points_bytes, stream, gpu_index, true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_scalars, scalars, scalars_bytes, stream, gpu_index, true);
// Convert to Montgomery form on GPU if not already in Montgomery form
if (!points_in_montgomery) {
convert_g1_points_to_montgomery(stream, gpu_index, d_points, n);
check_cuda_error(cudaGetLastError());
}
// Allocate scratch buffer sized to match the pippenger internal partitioning
size_t scratch_bytes = pippenger_scratch_size_g1(n, gpu_index);
auto* d_scratch = static_cast<G1Projective*>(cuda_malloc_with_size_tracking_async(
scratch_bytes, stream, gpu_index, size_tracker_ref, true));
PANIC_IF_FALSE(d_points && d_scalars && d_scratch,
"G1 MSM error: device memory allocation failed");
// Result written directly to host pointer -- no device round-trip needed
point_msm_g1_async(stream, gpu_index, result, d_points, d_scalars, n,
d_scratch, size_tracker_ref, true);
point_msm_async_g1(stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n, size_tracker_ref);
check_cuda_error(cudaGetLastError());
cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
cuda_memcpy_async_to_cpu(result, d_result, sizeof(G1Projective), stream, gpu_index);
cuda_drop_with_size_tracking_async(d_points, stream, gpu_index, true);
cuda_drop_with_size_tracking_async(d_scalars, stream, gpu_index, true);
cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
// Sync for the async frees above.
cuda_synchronize_stream(stream, gpu_index);
}
@@ -218,46 +275,69 @@ void g2_msm_managed_wrapper(
PANIC_IF_FALSE(stream != nullptr, "G2 MSM error: stream is null");
PANIC_IF_FALSE(points != nullptr, "G2 MSM error: points is null");
PANIC_IF_FALSE(scalars != nullptr, "G2 MSM error: scalars is null");
PANIC_IF_FALSE(gpu_index < static_cast<uint32_t>(cuda_get_number_of_gpus()),
PANIC_IF_FALSE(gpu_index < (uint32_t)cuda_get_number_of_gpus(),
"G2 MSM error: invalid gpu_index=%u (gpu_count=%d)", gpu_index,
cuda_get_number_of_gpus());
cuda_set_device(gpu_index);
const auto threadsPerBlock = get_msm_threads_per_block<G2Affine>(n);
const auto num_blocks = CEIL_DIV(n, threadsPerBlock);
// Compute buffer sizes with overflow checking.
size_t points_bytes = safe_mul_sizeof<G2Affine>(static_cast<size_t>(n));
size_t scalars_bytes = safe_mul_sizeof<Scalar>(static_cast<size_t>(n));
size_t scratch_elems = 0;
bool scratch_elems_overflow = __builtin_mul_overflow(
(size_t)(num_blocks + 1), (size_t)MSM_G2_BUCKET_COUNT, &scratch_elems);
PANIC_IF_FALSE(!scratch_elems_overflow,
"G2 MSM error: scratch element count overflow (num_blocks=%u)",
num_blocks);
// TODO: We should migrate to _unmanaged_ methods and have scratch/cleanup functions as tfhe-cuda-backend
size_t scratch_size = 0;
bool scratch_size_overflow =
__builtin_mul_overflow(scratch_elems, sizeof(G2Projective), &scratch_size);
PANIC_IF_FALSE(!scratch_size_overflow,
"G2 MSM error: scratch size overflow (scratch_elems=%zu)",
scratch_elems);
size_t points_bytes = 0;
bool points_bytes_overflow =
__builtin_mul_overflow((size_t)n, sizeof(G2Affine), &points_bytes);
PANIC_IF_FALSE(!points_bytes_overflow,
"G2 MSM error: points byte size overflow (n=%u)", n);
size_t scalars_bytes = 0;
bool scalars_bytes_overflow =
__builtin_mul_overflow((size_t)n, sizeof(Scalar), &scalars_bytes);
PANIC_IF_FALSE(!scalars_bytes_overflow,
"G2 MSM error: scalars byte size overflow (n=%u)", n);
auto* d_points = static_cast<G2Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_ref, true));
auto* d_result = static_cast<G2Projective*>(cuda_malloc_with_size_tracking_async(sizeof(G2Projective), stream, gpu_index, size_tracker_ref, true));
auto* d_scratch = static_cast<G2Projective*>(cuda_malloc_with_size_tracking_async(scratch_size, stream, gpu_index, size_tracker_ref, true));
PANIC_IF_FALSE(d_points && d_scalars && d_result && d_scratch,
"G2 MSM error: device memory allocation failed");
cuda_memcpy_with_size_tracking_async_to_gpu(d_points, points, points_bytes, stream, gpu_index, true);
cuda_memcpy_with_size_tracking_async_to_gpu(d_scalars, scalars, scalars_bytes, stream, gpu_index, true);
if (!points_in_montgomery) {
convert_g2_points_to_montgomery(stream, gpu_index, d_points, n);
check_cuda_error(cudaGetLastError());
}
// Allocate scratch buffer sized to match the pippenger internal partitioning
size_t scratch_bytes = pippenger_scratch_size_g2(n, gpu_index);
auto* d_scratch = static_cast<G2Projective*>(cuda_malloc_with_size_tracking_async(
scratch_bytes, stream, gpu_index, size_tracker_ref, true));
PANIC_IF_FALSE(d_points && d_scalars && d_scratch,
"G2 MSM error: device memory allocation failed");
// Result written directly to host pointer -- no device round-trip needed
point_msm_g2_async(stream, gpu_index, result, d_points, d_scalars, n,
d_scratch, size_tracker_ref, true);
point_msm_async_g2(stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n, size_tracker_ref);
check_cuda_error(cudaGetLastError());
cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
cuda_memcpy_async_to_cpu(result, d_result, sizeof(G2Projective), stream, gpu_index);
cuda_drop_with_size_tracking_async(d_points, stream, gpu_index, true);
cuda_drop_with_size_tracking_async(d_scalars, stream, gpu_index, true);
cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
// Sync for the async frees above.
cuda_synchronize_stream(stream, gpu_index);
}

View File

@@ -2,7 +2,6 @@
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
@@ -99,38 +98,33 @@ bool g1_is_infinity_wrapper(const G1Point* point);
// Check if G2 point is at infinity
bool g2_is_infinity_wrapper(const G2Point* point);
// Unmanaged MSM wrappers (points/scalars/scratch on device, result on host)
// Points MUST be in Montgomery form. Caller provides a scratch buffer.
// Zero internal allocations — all device memory is caller-provided.
void g1_msm_unmanaged_wrapper_async(
// Unmanaged MSM wrappers (assumes all data is already on device)
// If points_in_montgomery is false, a temporary copy will be made and converted.
// For best performance, provide points already in Montgomery form to avoid allocation overhead.
void g1_msm_unmanaged_wrapper(
cudaStream_t stream,
uint32_t gpu_index,
G1ProjectivePoint* h_result,
G1ProjectivePoint* d_result,
const G1Point* d_points,
const Scalar* d_scalars,
uint32_t n,
G1ProjectivePoint* d_scratch,
bool gpu_memory_allocated,
uint32_t n,
bool points_in_montgomery,
uint64_t* size_tracker
);
void g2_msm_unmanaged_wrapper_async(
void g2_msm_unmanaged_wrapper(
cudaStream_t stream,
uint32_t gpu_index,
G2ProjectivePoint* h_result,
G2ProjectivePoint* d_result,
const G2Point* d_points,
const Scalar* d_scalars,
uint32_t n,
G2ProjectivePoint* d_scratch,
bool gpu_memory_allocated,
uint32_t n,
bool points_in_montgomery,
uint64_t* size_tracker
);
// Scratch size queries for Pippenger MSM
// Returns the exact scratch buffer size in bytes needed for a given input count.
size_t pippenger_scratch_size_g1_wrapper(uint32_t n, uint32_t gpu_index);
size_t pippenger_scratch_size_g2_wrapper(uint32_t n, uint32_t gpu_index);
// Managed MSM wrappers with BigInt scalars (320-bit scalars)
// Handles memory allocation and transfers internally.
void g1_msm_managed_wrapper(

View File

@@ -84,8 +84,6 @@ impl G1Affine {
}
}
/// Displays coordinates in decimal. Assumes the point is in Montgomery form (e.g., from MSM
/// output).
impl fmt::Display for G1Affine {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.is_infinity() {
@@ -266,9 +264,8 @@ impl G1Projective {
let mut size_tracker: u64 = 0;
// NOTE: This method uses the managed API (g1_msm_managed_wrapper) which handles
// memory allocation and transfers internally. For a pure-GPU verify/proof implementation
// where all data is already on the device and memory is managed externally, use the
// unmanaged API (g1_msm_unmanaged_wrapper_async) instead — it performs zero internal
// allocations (caller provides d_scratch via pippenger_scratch_size_g1_wrapper).
// where all data is already on the device and memory is managed externally, consider
// using the unmanaged API (g1_msm_unmanaged_wrapper) instead for better performance.
//
// SAFETY:
// - `stream` was validated as non-null above and must be a valid `cudaStream_t` obtained
@@ -281,10 +278,6 @@ impl G1Projective {
// - `points_ffi` and `scalars_ffi` are valid Vec slices with matching length `n`
// - `result` and `size_tracker` are valid stack-allocated outputs
// - The managed wrapper handles all device memory allocation/deallocation internally
// - Failure: The C++ managed wrapper validates all inputs via PANIC_IF_FALSE and checks
// CUDA errors via cudaGetLastError() after each kernel launch.
// - Success: The C++ managed wrapper calls cuda_synchronize_stream before returning,
// ensuring `result` contains the final MSM output.
unsafe {
crate::bindings::g1_msm_managed_wrapper(
stream as crate::bindings::cudaStream_t,
@@ -302,7 +295,6 @@ impl G1Projective {
}
}
/// Converts to affine and displays. Assumes coordinates are in Montgomery form.
impl fmt::Display for G1Projective {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let affine = self.to_affine();

View File

@@ -84,8 +84,6 @@ impl G2Affine {
}
}
/// Displays coordinates in decimal. Assumes the point is in Montgomery form (e.g., from MSM
/// output).
impl fmt::Display for G2Affine {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.is_infinity() {
@@ -274,9 +272,8 @@ impl G2Projective {
let mut size_tracker: u64 = 0;
// NOTE: This method uses the managed API (g2_msm_managed_wrapper) which handles
// memory allocation and transfers internally. For a pure-GPU verify/proof implementation
// where all data is already on the device and memory is managed externally, use the
// unmanaged API (g2_msm_unmanaged_wrapper_async) instead — it performs zero internal
// allocations (caller provides d_scratch via pippenger_scratch_size_g2_wrapper).
// where all data is already on the device and memory is managed externally, consider
// using the unmanaged API (g2_msm_unmanaged_wrapper) instead for better performance.
//
// SAFETY:
// - `stream` was validated as non-null above and must be a valid `cudaStream_t` obtained
@@ -289,10 +286,6 @@ impl G2Projective {
// - `points_ffi` and `scalars_ffi` are valid Vec slices with matching length `n`
// - `result` and `size_tracker` are valid stack-allocated outputs
// - The managed wrapper handles all device memory allocation/deallocation internally
// - Failure: The C++ managed wrapper validates all inputs via PANIC_IF_FALSE and checks
// CUDA errors via cudaGetLastError() after each kernel launch.
// - Success: The C++ managed wrapper calls cuda_synchronize_stream before returning,
// ensuring `result` contains the final MSM output.
unsafe {
crate::bindings::g2_msm_managed_wrapper(
stream as crate::bindings::cudaStream_t,
@@ -310,7 +303,6 @@ impl G2Projective {
}
}
/// Converts to affine and displays. Assumes coordinates are in Montgomery form.
impl fmt::Display for G2Projective {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let affine = self.to_affine();

View File

@@ -34,7 +34,6 @@ class Layer(enum.StrEnum):
Integer = "integer"
Shortint = "shortint"
CoreCrypto = "core_crypto"
Wasm = "wasm"
@staticmethod
def from_str(layer_name):
@@ -47,8 +46,6 @@ class Layer(enum.StrEnum):
return Layer.Shortint
case "core_crypto":
return Layer.CoreCrypto
case "wasm":
return Layer.Wasm
case _:
raise NotImplementedError(f"layer '{layer_name}' not supported")
@@ -303,7 +300,6 @@ class BenchType(enum.StrEnum):
class BenchSubset(enum.StrEnum):
All = "all"
Erc20 = "erc20"
Zk = "zk"
@staticmethod
def from_str(bench_subset):
@@ -312,52 +308,10 @@ class BenchSubset(enum.StrEnum):
return BenchSubset.All
case "erc20":
return BenchSubset.Erc20
case "zk":
return BenchSubset.Zk
case _:
raise ValueError(f"BenchSubset '{bench_subset}' not supported")
class ZKOperation(enum.StrEnum):
"""
Operations names mapped to their display in the public documentation.
"""
Proof = "Proving"
Verify = "Verifying"
VerifyAndExpand = "Verify + expand"
@staticmethod
def from_str(op_name):
match op_name.lower().rsplit("pke_zk_")[-1]:
case "proof":
return ZKOperation.Proof
case "verify":
return ZKOperation.Verify
case "verify_and_expand":
return ZKOperation.VerifyAndExpand
case _:
raise ValueError(f"ZK operation '{op_name}' not supported")
class ZKComputeLoad(enum.StrEnum):
Proof = "slow proof / fast verify"
Verify = "fast proof / slow verify"
@staticmethod
def from_str(load):
match load.lower():
case "proof":
return ZKComputeLoad.Proof
case "verify":
return ZKComputeLoad.Verify
case _:
raise ValueError(f"ZK compute load '{load}' not supported")
def fs_safe_str(self):
return self.value.replace(" ", "_").replace("/", "and")
class ParamsDefinition:
"""
Represents a parameter definition for specific cryptographic settings.
@@ -515,7 +469,6 @@ class BenchDetails:
def __init__(self, layer: Layer, bench_full_name: str, bit_size: int):
self.layer = layer
self.bench_type = BenchType.Latency
self.operation_name = None
self.bit_size = bit_size
self.params = None
@@ -523,12 +476,11 @@ class BenchDetails:
self.sign_flavor = None
# Only relevant for HLApi layer
self.rust_type = None
self.case_variation = None
self.parse_test_name(bench_full_name)
def __repr__(self):
return f"BenchDetails(layer={self.layer.value}, type={self.bench_type}, operation_name={self.operation_name}, bit_size={self.bit_size}, params={self.params}, sign={self.sign_flavor or 'N/A'}, case={self.case_variation or 'N/A'})"
return f"BenchDetails(layer={self.layer.value}, operation_name={self.operation_name}, bit_size={self.bit_size}, params={self.params}, {self.sign_flavor})"
def __str__(self):
return self.__repr__()
@@ -536,26 +488,22 @@ class BenchDetails:
def __eq__(self, other):
return (
self.layer == other.layer
and self.bench_type == other.bench_type
and self.operation_name == other.operation_name
and self.bit_size == other.bit_size
and self.params == other.params
and self.sign_flavor == other.sign_flavor
and self.rust_type == other.rust_type
and self.case_variation == other.case_variation
)
def __hash__(self):
return hash(
(
self.layer,
self.bench_type,
self.operation_name,
self.bit_size,
self.params,
self.rust_type,
self.sign_flavor,
self.case_variation,
)
)
@@ -570,9 +518,6 @@ class BenchDetails:
"""
parts = name.split("::")
if "throughput" in parts:
self.bench_type = BenchType.Throughput
for part in parts:
if "PARAM" in part:
self.params = part.partition("_mean")[0]
@@ -580,11 +525,7 @@ class BenchDetails:
match self.layer:
case Layer.Integer:
op_name_index = 2 if parts[1] in ["cuda", "hpu", "zk"] else 1
if self.params and not parts[-1].startswith(self.params):
self.case_variation = parts[-1].partition("_mean")[0]
op_name_index = 2 if parts[1] in ["cuda", "hpu"] else 1
if parts[op_name_index] == "signed":
op_name_index += 1
self.sign_flavor = SignFlavor.Signed
@@ -623,12 +564,6 @@ class BenchDetails:
self.rust_type = parts[-1].partition("_mean")[0]
case Layer.Shortint:
self.operation_name = parts[1]
case Layer.Wasm:
op_name_index = 2 if parts[1] in ["cuda", "hpu", "zk"] else 1
self.operation_name = parts[op_name_index]
if self.params and not parts[-1].startswith(self.params):
self.case_variation = parts[-1].partition("_mean")[0]
case _:
raise NotImplementedError(
f"layer '{self.layer}' not supported yet for name parsing"

View File

@@ -35,8 +35,6 @@ class UserConfig:
self.bench_subset = BenchSubset.from_str(input_args.bench_subset)
self.name_suffix = input_args.name_suffix
self.layer = Layer.from_str(input_args.layer.lower())
self.pbs_kind = PBSKind.from_str(input_args.pbs_kind)
self.grouping_factor = input_args.grouping_factor

View File

@@ -132,7 +132,7 @@ class PostgreConnector:
operation_filter: list = None,
layer: Layer = None,
branch: str = None,
name_suffix: str = None,
name_suffix: str = "_mean_avx512",
last_value_only: bool = True,
) -> dict[BenchDetails, list[int]]:
"""
@@ -155,7 +155,7 @@ class PostgreConnector:
:type layer: Layer, optional
:param branch: Optional branch filter, defaulting to the user's head branch if not specified.
:type branch: str, optional
:param name_suffix: Suffix to match the test names.
:param name_suffix: Suffix to match the test names, defaulting to "_mean_avx512".
:type name_suffix: str, optional
:param last_value_only: A flag indicating whether to fetch only the most recent metric value for each benchmark.
:type last_value_only: bool
@@ -169,7 +169,6 @@ class PostgreConnector:
layer = layer if layer else user_config.layer
version = user_config.project_version
pbs_kind = user_config.pbs_kind
name_suffix = name_suffix if name_suffix else user_config.name_suffix
timestamp_range_end = user_config.bench_date
timestamp = datetime.datetime.fromisoformat(timestamp_range_end)

View File

@@ -24,7 +24,6 @@ import connector
import formatters.core
import formatters.hlapi
import formatters.integer
import formatters.wasm
import regression
from benchmark_specs import BenchSubset, BenchType, Layer, OperandType, RustType
from formatters.common import BenchArray, CSVFormatter, MarkdownFormatter, SVGFormatter
@@ -137,16 +136,13 @@ parser.add_argument(
parser.add_argument(
"--bench-subset",
dest="bench_subset",
choices=["all", "erc20", "zk"],
choices=[
"all",
"erc20",
],
default="all",
help="Subset of benchmarks to filter against, dedicated formatting will be applied",
)
parser.add_argument(
"--name-suffix",
dest="name_suffix",
default="_mean_avx512",
help="Suffix to match the test names",
)
parser.add_argument(
"--regression-profiles",
dest="regression_profiles",
@@ -287,11 +283,6 @@ def get_formatter(layer: Layer, bench_subset: BenchSubset):
match bench_subset:
case BenchSubset.Erc20:
return formatters.hlapi.Erc20Formatter
case BenchSubset.Zk:
if layer == Layer.Wasm:
return formatters.wasm.ZKFormatter
else:
return formatters.integer.ZKFormatter
match layer:
case Layer.Integer:
@@ -434,26 +425,6 @@ def generate_files_from_arrays(
)
def get_operands_types(layer: Layer, bench_subset: BenchSubset = None):
    """
    Select the operand types to iterate over for a layer and an optional benchmark subset.

    :param layer: layer the benchmarks belong to
    :param bench_subset: optional subset of benchmarks being processed
    :return: tuple of :class:`OperandType`, either ciphertext only or ciphertext and plaintext
    :raises NotImplementedError: if the subset is not handled
    """
    only_ciphertext = (OperandType.CipherText,)
    ciphertext_then_plaintext = (OperandType.CipherText, OperandType.PlainText)

    # The core_crypto layer is ciphertext only, whatever the subset is.
    if layer == Layer.CoreCrypto:
        return only_ciphertext

    # No subset given: both operand types are processed.
    if not bench_subset:
        return ciphertext_then_plaintext

    if bench_subset == BenchSubset.Zk or bench_subset == BenchSubset.Erc20:
        return only_ciphertext
    if bench_subset == BenchSubset.All:
        return ciphertext_then_plaintext

    raise NotImplementedError(
        f"operand types cannot be defined for bench subset '{bench_subset}'"
    )
if __name__ == "__main__":
args = parser.parse_args()
user_config = config.UserConfig(args)
@@ -501,9 +472,7 @@ if __name__ == "__main__":
args.hardware_comp.lower().split(",") if args.hardware_comp else None
)
operands_types = get_operands_types(layer, bench_subset)
for operand_type in operands_types:
for operand_type in (OperandType.CipherText, OperandType.PlainText):
if hardware_list:
perform_hardware_comparison(user_config, layer)
@@ -511,6 +480,11 @@ if __name__ == "__main__":
print("Markdown generation is not supported with comparisons")
continue
if (
layer == Layer.CoreCrypto or (layer == Layer.HLApi and bench_subset)
) and operand_type == OperandType.PlainText:
continue
file_suffix = f"_{operand_type.lower()}"
arrays = perform_data_extraction(
user_config,

View File

@@ -15,8 +15,6 @@ from benchmark_specs import (
OperandType,
PBSKind,
RustType,
ZKComputeLoad,
ZKOperation,
)
from py_markdown_table.markdown_table import markdown_table
@@ -309,109 +307,6 @@ class MarkdownFormatter(GenericFormatter):
return md_array
class ZKGenericFormatter(GenericFormatter):
    """
    Base formatter for ZK benchmark results.

    Results are grouped per compute load and per packed input size, then rendered as
    two arrays (one per compute load). Subclasses must implement
    :meth:`_get_default_dict` and :meth:`_match_case_variation_filter`.
    """

    INPUTS_PROOF_COLUMN_HEADERS = f"Inputs ({ZKComputeLoad.Proof.value})"
    INPUTS_VERIFY_COLUMN_HEADERS = f"Inputs ({ZKComputeLoad.Verify.value})"

    # Only benchmark cases run with this CRS size are kept.
    DEFAULT_CRS_SIZE = 2048

    @staticmethod
    def _get_default_dict() -> collections.defaultdict:
        """
        Return a defaultdict whose factory builds the placeholder row for one test case.
        """
        raise NotImplementedError("This method must be implemented by subclasses")

    @staticmethod
    def _match_case_variation_filter(case_variation: dict):
        """
        Tell whether a parsed case variation must be kept in the formatted results.
        """
        raise NotImplementedError("This method must be implemented by subclasses")

    def _format_data(self, data: dict[BenchDetails, list[int]], conversion_func):
        """
        Keep the relevant benchmark cases and index them by test name.

        :param data: timings per benchmark
        :param conversion_func: function applied to the last timing of each benchmark
        :return: mapping ``"<compute_load>::<packed_size>::<crs_size>"`` to a dict of
            :class:`ZKOperation` to converted value
        """
        formatted = self._get_default_dict()

        for details, timings in data.items():
            variation = self._parse_benchmarks_case_variation(details.case_variation)

            if variation["crs_size"] != self.DEFAULT_CRS_SIZE:
                continue
            if not self._match_case_variation_filter(variation):
                continue

            test_name = "::".join(
                [
                    variation["compute_load"],
                    str(variation["packed_size"]),
                    str(variation["crs_size"]),
                ]
            )
            # Only the last recorded timing is displayed.
            formatted[test_name][ZKOperation.from_str(details.operation_name)] = (
                conversion_func(timings[-1])
            )

        return formatted

    @staticmethod
    def _parse_benchmarks_case_variation(case_variation: str):
        """
        Extract the packed size, CRS size and compute load from a case variation string.

        The string is split on ``_``; fields 0, 3 and 8 hold the packed size, the CRS
        size and the compute load respectively.
        """
        parts = case_variation.split("_")
        return {
            "packed_size": int(parts[0]),
            "crs_size": int(parts[3]),
            "compute_load": parts[8],
        }

    def _generate_arrays(self, data, *args, **kwargs):
        """
        Build one array per compute load, with one line per documented input size.

        An input size without any result gets the placeholder row from
        :meth:`_get_default_dict`, so both arrays always list every input.

        :param data: mapping produced by :meth:`_format_data`
        :return: list of two :class:`BenchArray` (proof load first, verify load second)
        """
        # Sorted as they appear in the public documentation.
        input_names = {
            64: "1xFheUint64 (64 bits)",
            256: "4xFheUint64 (256 bits) ",
            2048: "32xFheUint64 (2048 bits)",
        }

        rows_by_load = {
            ZKComputeLoad.Proof: {},
            ZKComputeLoad.Verify: {},
        }
        for key, row in data.items():
            compute_load, packed_bits, _ = key.split("::")
            packed_bits = int(packed_bits)
            if packed_bits not in input_names:
                continue
            rows_by_load[ZKComputeLoad.from_str(compute_load)][packed_bits] = row

        headers = {
            ZKComputeLoad.Proof: self.INPUTS_PROOF_COLUMN_HEADERS,
            ZKComputeLoad.Verify: self.INPUTS_VERIFY_COLUMN_HEADERS,
        }
        # Source of placeholder rows for input sizes that have no result.
        placeholder_rows = self._get_default_dict()

        tables = {}
        for load, rows in rows_by_load.items():
            header = headers[load]
            table = []
            # Iterating over input_names ensures display consistency between inputs.
            for packed_bits, input_name in input_names.items():
                if packed_bits in rows:
                    row = rows[packed_bits]
                else:
                    row = placeholder_rows[packed_bits]
                line = {header: input_name}
                line.update({op.value: v for op, v in row.items()})
                table.append(line)
            tables[load] = table

        return [
            BenchArray(
                tables[ZKComputeLoad.Proof],
                self.layer,
                metadata={"compute_load": ZKComputeLoad.Proof.fs_safe_str()},
            ),
            BenchArray(
                tables[ZKComputeLoad.Verify],
                self.layer,
                metadata={"compute_load": ZKComputeLoad.Verify.fs_safe_str()},
            ),
        ]
# -------------
# SVG constants
# -------------
@@ -512,7 +407,7 @@ class SVGFormatter(GenericFormatter):
)
else: # Backends comparison (CPU, GPU, HPU)
header_elements.append(header_one_row_span)
case Layer.HLApi | Layer.CoreCrypto | Layer.Wasm:
case Layer.HLApi | Layer.CoreCrypto:
# Core_crypto arrays contains only ciphertext modulus size as headers
header_elements.append(header_one_row_span)
case _:

View File

@@ -5,17 +5,10 @@ from benchmark_specs import (
ALL_RUST_INTEGER_TYPES,
Backend,
BenchDetails,
BenchType,
OperandType,
RustType,
ZKOperation,
)
from formatters.common import (
OPERATION_SIZE_COLUMN_HEADER,
BenchArray,
GenericFormatter,
ZKGenericFormatter,
)
from formatters.common import OPERATION_SIZE_COLUMN_HEADER, BenchArray, GenericFormatter
class OperationDisplayName(enum.StrEnum):
@@ -240,21 +233,3 @@ class IntegerFormatter(GenericFormatter):
return [
BenchArray(result_lines, self.layer),
]
class ZKFormatter(ZKGenericFormatter):
    """
    ZK formatter for the integer layer: reports proof, verify and verify-and-expand.
    """

    @staticmethod
    def _get_default_dict() -> collections.defaultdict:
        """
        Return a defaultdict whose missing entries are rows filled with "N/A".
        """

        def placeholder_row():
            operations = (
                ZKOperation.Proof,
                ZKOperation.Verify,
                ZKOperation.VerifyAndExpand,
            )
            return dict.fromkeys(operations, "N/A")

        return collections.defaultdict(placeholder_row)

    @staticmethod
    def _match_case_variation_filter(*args, **kwargs):
        """
        Accept every case variation.

        ZK benchmarks at this layer are server-like, so there are no variations
        (such as a browser kind) to filter on.
        """
        return True

View File

@@ -1 +0,0 @@
from .wasm import *

View File

@@ -1,82 +0,0 @@
import collections
import enum
from benchmark_specs import ZKOperation
from formatters.common import ZKGenericFormatter
class Browser(enum.StrEnum):
Chrome = "chrome"
Firefox = "firefox"
@staticmethod
def from_str(browser_name):
match browser_name.lower():
case "chrome":
return Browser.Chrome
case "firefox":
return Browser.Firefox
case _:
raise ValueError(f"Browser '{browser_name}' not supported")
DEFAULT_BROWSER = Browser.Chrome
class ZKFormatter(ZKGenericFormatter):
    """
    ZK formatter for the WASM layer: only the proof operation is reported.
    """

    @staticmethod
    def _get_default_dict() -> collections.defaultdict:
        """
        Return a defaultdict whose missing entries are rows filled with "N/A".
        """
        return collections.defaultdict(
            lambda: {
                ZKOperation.Proof: "N/A",
            }
        )

    @staticmethod
    def _parse_benchmarks_case_variation(case_variation: str):
        """
        Parse a case variation string, including its optional sub-variation.

        The string is split on ``_``; fields 0, 3 and 8 hold the packed size, the CRS
        size and the compute load. Any remaining fields form the sub-variation: an
        optional leading version (starting with ``zkv``), free-form details, and an
        optional trailing browser name.

        :param case_variation: raw case variation taken from the benchmark name
        :return: dict with ``packed_size``, ``crs_size``, ``compute_load`` and
            ``sub_variation`` (empty dict when the name carries no sub-variation)
        """
        parts = case_variation.split("_")
        case = {
            "packed_size": int(parts[0]),
            "crs_size": int(parts[3]),
            "compute_load": parts[8],
            "sub_variation": {},
        }

        # Slicing never raises IndexError, so emptiness must be checked explicitly.
        sub_variation_parts = parts[9:]
        if not sub_variation_parts:
            # No sub variation for this case
            return case

        try:
            browser = Browser.from_str(sub_variation_parts[-1])
            sub_variation_parts.pop()
        except ValueError:
            browser = None

        version = None
        # The list may be empty here when the browser was the only sub-variation field.
        if sub_variation_parts and sub_variation_parts[0].lower().startswith("zkv"):
            version = sub_variation_parts.pop(0)

        details = sub_variation_parts[:]

        case["sub_variation"] = {
            "version": version,
            "browser": browser,
            "details": details,
        }

        return case

    @staticmethod
    def _match_case_variation_filter(case_variation: dict):
        """
        Keep only cases run on the default browser with no extra details.
        """
        sub_variation = case_variation["sub_variation"]
        try:
            # No details must be specified, otherwise the case could be a ciphertext
            # size measurement or a non-threaded benchmark.
            return (
                sub_variation["browser"] == DEFAULT_BROWSER
                and sub_variation["details"] == []
            )
        except KeyError:
            # At least we must have a browser specified.
            return False

View File

@@ -25,7 +25,7 @@ user = "ubuntu"
# Profile used to build CUDA code without the need to get p-like instance.
[backend.aws.gpu-build]
region = "us-east-1"
image_id = "ami-093b80553736c78e3"
image_id = "ami-06a04649d895d10e0"
instance_type = "m6i.4xlarge"
user = "ubuntu"

View File

@@ -54,13 +54,13 @@ RUST_CALL_SITES = [
# Bindings parsed from bindings.rs
# Scratch functions: Two more than cleanup functions because of
# 'scratch_cuda_programmable_bootstrap_32_async' and
EXPECTED_SCRATCH_COUNT = 70
EXPECTED_SCRATCH_COUNT = 71
# Cuda operation functions
EXPECTED_CUDA_COUNT = 107
EXPECTED_CUDA_COUNT = 109
# Cleanup functions
EXPECTED_CLEANUP_COUNT = 70
EXPECTED_CLEANUP_COUNT = 71
# Check 3: Rust call-site scanning
# Number of functions in ffi.rs files

View File

@@ -1,4 +1,4 @@
#! /usr/bin/env bash
#! /usr/bin/env/ bash
# Find current script directory. This should be PROJECT_DIR
CUR_SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
@@ -15,11 +15,7 @@ HPU_CONFIG="sim"
RUST_LOG="info"
# Setting PCI device variable: depends on the machine
if command -v lscpi &> /dev/null; then
mapfile -t DEVICE< <(lspci -d 10ee:50b5)
else
DEVICE=()
fi
mapfile -t DEVICE< <(lspci -d 10ee:50b5)
V80_PCIE_DEV="unselected"
# V80 bitstream refresh rely on XilinxVivado tools
@@ -33,7 +29,7 @@ opt_short="hc:l:p:"
opt_long="help,config:,rust-log:pcie-dev"
OPTS=$(getopt -o "$opt_short" -l "$opt_long" -- "$@")
while [ $# -gt 0 ]
while true
do
case "$1" in
-h|--help)
@@ -63,7 +59,7 @@ do
shift 2
;;
-p|--pcie-dev)
if [ -n "${2:-}" ] && [[ ! ${2:-} =~ ^- ]]; then
if [ -n "${2}" ] && [[ ! ${2} =~ ^- ]]; then
V80_PCIE_DEV="${2}"
((i++))
shift 1

View File

@@ -3,7 +3,9 @@ use crate::{load_and_unversionize, TestedModule};
use std::path::Path;
#[cfg(feature = "zk-pok")]
use tfhe::integer::parameters::DynamicDistribution;
use tfhe::prelude::*;
use tfhe::prelude::{
CiphertextList, FheDecrypt, FheEncrypt, ParameterSetConformant, ReRandomize, SquashNoise,
};
#[cfg(feature = "zk-pok")]
use tfhe::shortint::parameters::{
CompactCiphertextListExpansionKind, CompactPublicKeyEncryptionParameters,
@@ -11,29 +13,27 @@ use tfhe::shortint::parameters::{
#[cfg(feature = "zk-pok")]
use tfhe::shortint::prelude::LweDimension;
use tfhe::shortint::{CarryModulus, CiphertextModulus, MessageModulus};
use tfhe::xof_key_set::CompressedXofKeySet;
#[cfg(feature = "zk-pok")]
use tfhe::zk::{CompactPkeCrs, CompactPkeCrsConformanceParams};
#[cfg(feature = "zk-pok")]
use tfhe::ProvenCompactCiphertextList;
use tfhe::{
set_server_key, ClientKey, CompactCiphertextList, CompactCiphertextListBuilder,
CompactPublicKey, CompressedCiphertextList, CompressedCiphertextListBuilder,
set_server_key, ClientKey, CompactCiphertextList, CompressedCiphertextList,
CompressedCompactPublicKey, CompressedFheBool, CompressedFheInt8, CompressedFheUint8,
CompressedKVStore, CompressedPublicKey, CompressedServerKey,
CompressedSquashedNoiseCiphertextList, CompressedSquashedNoiseCiphertextListBuilder, FheBool,
FheInt8, FheUint32, FheUint64, FheUint8, ReRandomizationContext, ServerKey,
SquashedNoiseFheBool, SquashedNoiseFheInt, SquashedNoiseFheUint,
CompressedSquashedNoiseCiphertextList, FheBool, FheInt8, FheUint64, FheUint8,
ReRandomizationContext, ServerKey, SquashedNoiseFheBool, SquashedNoiseFheInt,
SquashedNoiseFheUint,
};
#[cfg(feature = "zk-pok")]
use tfhe::{CompactPublicKey, ProvenCompactCiphertextList};
use tfhe_backward_compat_data::load::{
load_versioned_auxiliary, DataFormat, TestFailure, TestResult, TestSuccess,
};
use tfhe_backward_compat_data::{
DataKind, HlBoolCiphertextTest, HlCiphertextTest, HlClientKeyTest, HlCompressedKVStoreTest,
HlCompressedSquashedNoiseCiphertextListTest, HlCompressedXofKeySetTest,
HlHeterogeneousCiphertextListTest, HlPublicKeyTest, HlServerKeyTest, HlSignedCiphertextTest,
HlSquashedNoiseBoolCiphertextTest, HlSquashedNoiseSignedCiphertextTest,
HlSquashedNoiseUnsignedCiphertextTest, TestMetadata, TestType, Testcase, ZkPkePublicParamsTest,
HlCompressedSquashedNoiseCiphertextListTest, HlHeterogeneousCiphertextListTest,
HlPublicKeyTest, HlServerKeyTest, HlSignedCiphertextTest, HlSquashedNoiseBoolCiphertextTest,
HlSquashedNoiseSignedCiphertextTest, HlSquashedNoiseUnsignedCiphertextTest, TestMetadata,
TestType, Testcase, ZkPkePublicParamsTest,
};
use tfhe_versionable::Unversionize;
@@ -360,155 +360,6 @@ pub fn test_hl_pubkey(
}
}
/// Shared feature-testing logic for server keys: computation, re-randomization, noise squashing,
/// compression, and compressed noise-squashed lists.
///
/// Two `FheUint32` inputs are produced (expanded from a packed compact list when
/// `compact_public_key` is provided, encrypted with `client_key` otherwise), then each feature is
/// exercised only if `server_key` reports supporting it. Every decrypted value is compared with
/// the result of the same operation on the clear inputs.
///
/// # Errors
///
/// Returns `test.failure(..)` when expansion, re-randomization, noise squashing or list building
/// fails, or when a decrypted value differs from the expected one.
///
/// # Panics
///
/// Panics if an element cannot be retrieved from an expanded or compressed list, or if the
/// re-randomization seed generator yields no seed (these paths use `unwrap`).
fn test_hl_key_features(
client_key: &ClientKey,
server_key: ServerKey,
compact_public_key: Option<&CompactPublicKey>,
test: &impl TestType,
format: DataFormat,
) -> Result<(), TestFailure> {
// A clone is registered; `server_key` itself is kept to query its `supports_*` methods below.
set_server_key(server_key.clone());
let clear_a = 278120u32;
let clear_b = 839412u32;
// Inputs come from the compact public key when one is provided, from the client key otherwise.
let (mut a, mut b) = match compact_public_key {
Some(pk) => {
let compact_list = CompactCiphertextListBuilder::new(pk)
.push(clear_a)
.push(clear_b)
.build_packed();
let expanded = compact_list
.expand()
.map_err(|e| test.failure(format!("Failed to expand: {e}"), format))?;
// NOTE(review): these unwraps panic instead of returning a `TestFailure`.
let a: FheUint32 = expanded.get(0).unwrap().unwrap();
let b: FheUint32 = expanded.get(1).unwrap().unwrap();
(a, b)
}
None => {
let a = FheUint32::encrypt(clear_a, client_key);
let b = FheUint32::encrypt(clear_b, client_key);
(a, b)
}
};
// Re-randomization: only when a compact public key is available and the server key supports it
if let (Some(pk), true) = (
compact_public_key,
server_key.supports_ciphertext_re_randomization(),
) {
// Fixed 32-byte nonce (0, 1, 2, ...) rather than a random one.
let nonce: [u8; 256 / 8] = core::array::from_fn(|i| i as u8);
let mut re_rand_context = ReRandomizationContext::new(
*b"TFHE_Rrd",
[b"FheUint32 bin ops".as_slice(), nonce.as_slice()],
*b"TFHE_Enc",
);
re_rand_context.add_ciphertext(&a);
re_rand_context.add_ciphertext(&b);
let mut seed_gen = re_rand_context.finalize();
// One seed is drawn per ciphertext, in the order the ciphertexts were added.
a.re_randomize(pk, seed_gen.next_seed().unwrap())
.map_err(|e| test.failure(format!("Failed to re-randomize a: {e}"), format))?;
b.re_randomize(pk, seed_gen.next_seed().unwrap())
.map_err(|e| test.failure(format!("Failed to re-randomize b: {e}"), format))?;
}
// Computation: an addition and a bitwise AND, checked against the clear results
let c = &a + &b;
let d = &a & &b;
let expected_c = clear_a.wrapping_add(clear_b);
let expected_d = clear_a & clear_b;
for (val, expected) in [&c, &d].iter().zip([expected_c, expected_d]) {
let dec: u32 = val.decrypt(client_key);
if dec != expected {
return Err(test.failure(
format!("Invalid decryption: expected {expected}, got {dec}"),
format,
));
}
}
// Noise squashing: the squashed results must decrypt to the same values
if server_key.supports_noise_squashing() {
let ns_c = c
.squash_noise()
.map_err(|e| test.failure(format!("Failed to squash noise: {e}"), format))?;
let ns_d = d
.squash_noise()
.map_err(|e| test.failure(format!("Failed to squash noise: {e}"), format))?;
for (ns_val, expected) in [&ns_c, &ns_d].iter().zip([expected_c, expected_d]) {
let dec: u32 = ns_val.decrypt(client_key);
if dec != expected {
return Err(test.failure(
format!("Invalid noise-squashed decryption: expected {expected}, got {dec}"),
format,
));
}
}
if server_key.supports_noise_squashing_compression() {
// Compressed noise-squashed ciphertext list (takes ownership of `ns_c` and `ns_d`)
let ns_compressed_list = CompressedSquashedNoiseCiphertextListBuilder::new()
.push(ns_c)
.push(ns_d)
.build()
.map_err(|e| {
test.failure(
format!("Failed to build compressed squashed noise list: {e}"),
format,
)
})?;
for (i, expected) in [expected_c, expected_d].iter().enumerate() {
let val: SquashedNoiseFheUint = ns_compressed_list.get(i).unwrap().unwrap();
let dec: u32 = val.decrypt(client_key);
if dec != *expected {
return Err(test.failure(
format!(
"Invalid compressed noise-squashed[{i}]: \
expected {expected}, got {dec}"
),
format,
));
}
}
}
}
// Compression / decompression: done last because it takes ownership of `a`, `b`, `c` and `d`
if server_key.supports_compression() {
let compressed_list = CompressedCiphertextListBuilder::new()
.push(a)
.push(b)
.push(c)
.push(d)
.build()
.map_err(|e| test.failure(format!("Failed to build compressed list: {e}"), format))?;
// Same order as the pushes above.
let expected_values = [clear_a, clear_b, expected_c, expected_d];
for (i, expected) in expected_values.iter().enumerate() {
let val: FheUint32 = compressed_list.get(i).unwrap().unwrap();
let dec: u32 = val.decrypt(client_key);
if dec != *expected {
return Err(test.failure(
format!("Invalid decompressed[{i}]: expected {expected}, got {dec}"),
format,
));
}
}
}
Ok(())
}
/// Test HL server key: encrypt two values with a client key, add them using the server key and
/// check that the decrypted sum is valid.
pub fn test_hl_serverkey(
@@ -522,6 +373,11 @@ pub fn test_hl_serverkey(
)
.map_err(|e| test.failure(e, format))?;
let v1 = 73u8;
let mut ct1 = FheUint8::encrypt(v1, &client_key);
let v2 = 102u8;
let ct2 = FheUint8::encrypt(v2, &client_key);
let key = if test.compressed {
let compressed: CompressedServerKey = load_and_unversionize(dir, test, format)?;
compressed.decompress()
@@ -529,20 +385,77 @@ pub fn test_hl_serverkey(
load_and_unversionize(dir, test, format)?
};
let compact_public_key = test
.rerand_cpk_filename
.as_ref()
.map(|filename| {
let cpk_file = dir.join(filename.to_string());
CompressedCompactPublicKey::unversionize(
load_versioned_auxiliary(cpk_file).map_err(|e| test.failure(e, format))?,
)
.map_err(|e| test.failure(e, format))
.map(|cpk| cpk.decompress())
})
.transpose()?;
let has_noise_squashing = key.supports_noise_squashing();
let has_rerand = key.supports_ciphertext_re_randomization();
set_server_key(key);
test_hl_key_features(&client_key, key, compact_public_key.as_ref(), test, format)?;
if has_noise_squashing {
let ns = ct1.squash_noise().unwrap();
let res: u8 = ns.decrypt(&client_key);
if res != v1 {
return Err(test.failure(
format!(
"Invalid result for noise squashing using loaded server key, expected {v1} got {res}",
),
format,
));
}
}
if let Some(rerand_cpk_filename) = test.rerand_cpk_filename.as_ref() {
if has_rerand {
let rerand_cpk_file = dir.join(rerand_cpk_filename.to_string());
let public_key = CompressedCompactPublicKey::unversionize(
load_versioned_auxiliary(rerand_cpk_file).map_err(|e| test.failure(e, format))?,
)
.map_err(|e| test.failure(e, format))?
.decompress();
let nonce: [u8; 256 / 8] = rand::random();
let mut re_rand_context = ReRandomizationContext::new(
*b"TFHE_Rrd",
[b"FheUint8".as_slice(), nonce.as_slice()],
*b"TFHE_Enc",
);
re_rand_context.add_ciphertext(&ct1);
let mut seed_gen = re_rand_context.finalize();
ct1.re_randomize(&public_key, seed_gen.next_seed().unwrap())
.unwrap();
#[allow(clippy::eq_op)]
let rrd = &ct1 & &ct1;
let res: u8 = rrd.decrypt(&client_key);
if res != v1 {
return Err(test.failure(
format!(
"Invalid result for rerand using loaded server key, expected {v1} got {res}",
),
format,
));
}
} else {
return Err(test.failure(
"Test requires rerand key but server key does not have it".to_string(),
format,
));
}
}
let ct_sum = ct1 + ct2;
let sum: u8 = ct_sum.decrypt(&client_key);
if sum != v1 + v2 {
return Err(test.failure(
format!(
"Invalid result for addition using loaded server key, expected {} got {}",
v1 + v2,
sum,
),
format,
));
}
Ok(test.success(format))
}
@@ -746,39 +659,6 @@ fn test_hl_compressed_kv_store_test(
Ok(test.success(format))
}
/// Checks a stored compressed XOF key set against its stored client key.
///
/// Loads and unversionizes the client key and the compressed XOF key set whose file names
/// are given by `test` (resolved relative to `dir`), decompresses the key set, splits it into
/// its public key and server key, and runs `test_hl_key_features` on them.
///
/// # Errors
///
/// Returns a `TestFailure` built through `test.failure` when a file cannot be loaded or
/// unversionized, when decompression fails, or when `test_hl_key_features` fails.
fn test_hl_compressed_xof_key_set_test(
    dir: &Path,
    test: &HlCompressedXofKeySetTest,
    format: DataFormat,
) -> Result<TestSuccess, TestFailure> {
    // Client key: read the versioned payload first, then upgrade it to the current type.
    let versioned_client_key = load_versioned_auxiliary(dir.join(&*test.client_key_file_name))
        .map_err(|e| test.failure(e, format))?;
    let client_key = ClientKey::unversionize(versioned_client_key)
        .map_err(|e| test.failure(format!("Failed to load client key file: {e}"), format))?;

    // Key set: read, upgrade, decompress and split in one chain, one error message per step.
    let versioned_key_set =
        load_versioned_auxiliary(dir.join(&*test.compressed_xof_key_set_file_name))
            .map_err(|e| test.failure(e, format))?;
    let (pk, server_key) = CompressedXofKeySet::unversionize(versioned_key_set)
        .map_err(|e| {
            test.failure(
                format!("Failed to load compressed xof key set file: {e}"),
                format,
            )
        })?
        .decompress()
        .map_err(|e| test.failure(format!("Failed to decompress the xof key set: {e}"), format))?
        .into_raw_parts();

    test_hl_key_features(&client_key, server_key, Some(&pk), test, format)?;

    Ok(test.success(format))
}
pub struct Hl;
impl TestedModule for Hl {
@@ -831,9 +711,6 @@ impl TestedModule for Hl {
TestMetadata::HlCompressedKVStoreTest(test) => {
test_hl_compressed_kv_store_test(test_dir.as_ref(), test, format).into()
}
TestMetadata::HlCompressedXofKeySet(test) => {
test_hl_compressed_xof_key_set_test(test_dir.as_ref(), test, format).into()
}
_ => {
println!("WARNING: missing test: {:?}", testcase.metadata);
TestResult::Skipped(testcase.skip())

View File

@@ -2,8 +2,8 @@
name = "tfhe-benchmark"
version = "0.1.0"
edition = "2021"
homepage = "https://zama.org/"
documentation = "https://docs.zama.org/tfhe-rs"
homepage = "https://zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
license = "BSD-3-Clause-Clear"
description = "tfhe-benchmark: Performances measurements facility for tfhe-rs."
@@ -29,7 +29,6 @@ rand = { workspace = true }
rayon = { workspace = true }
tfhe = { path = "../tfhe", default-features = false }
tfhe-csprng = { path = "../tfhe-csprng" }
tfhe-zk-pok = { path = "../tfhe-zk-pok", optional = true }
cpu-time = "1.0"
num_cpus = "1.17"
gag = "1.0.0"
@@ -40,14 +39,12 @@ boolean = ["tfhe/boolean"]
shortint = ["tfhe/shortint"]
integer = ["shortint", "tfhe/integer"]
gpu = ["tfhe/gpu"]
# gpu enables tfhe-cuda-backend which provides CUDA stream management used by tfhe-zk-pok
gpu-experimental-zk = ["gpu", "zk-pok", "tfhe/gpu-experimental-zk", "tfhe-zk-pok/gpu-experimental"]
hpu = ["tfhe/hpu"]
hpu-v80 = ["tfhe/hpu-v80"]
internal-keycache = ["tfhe/internal-keycache"]
avx512 = ["tfhe/avx512"]
pbs-stats = ["tfhe/pbs-stats"]
zk-pok = ["tfhe/zk-pok", "dep:tfhe-zk-pok"]
zk-pok = ["tfhe/zk-pok"]
[[bench]]
name = "boolean"
@@ -199,12 +196,6 @@ path = "benches/core_crypto/pbs128_bench.rs"
harness = false
required-features = ["shortint", "internal-keycache"]
[[bench]]
name = "zk-msm"
path = "benches/zk/msm.rs"
harness = false
required-features = ["zk-pok"]
[[bin]]
name = "boolean_key_sizes"
path = "src/bin/boolean_key_sizes.rs"

View File

@@ -1,7 +1,7 @@
use benchmark::high_level_api::bench_wait::*;
use benchmark::high_level_api::benchmark_op::*;
use benchmark::utilities::{
get_bench_type, will_this_bench_run, write_to_json, BenchmarkType, OperandType, OperatorType,
get_bench_type, write_to_json, BenchmarkType, OperandType, OperatorType,
};
use criterion::{black_box, Criterion, Throughput};
use rand::prelude::*;
@@ -18,6 +18,34 @@ pub struct BenchConfig<'a> {
pub bit_size: usize,
}
/// Tells whether `bench_id` in `bench_group` is selected by the current Criterion filter.
///
/// Callers can use the answer to skip expensive setup for benchmarks that will not run.
/// A throw-away Criterion instance, configured from the command-line arguments and given
/// minimal warm-up and measurement times, is asked to run a no-op benchmark under the same
/// group and id; the benchmark body only flips a flag. `Gag` silences stdout and stderr
/// while the probe runs so that Criterion's temporary output is not shown.
///
/// # Panics
///
/// Panics if stdout or stderr cannot be gagged.
pub fn will_this_bench_run(bench_group: &str, bench_id: &str) -> bool {
    use gag::Gag;

    let near_zero = std::time::Duration::from_nanos(1);
    let mut probe = Criterion::default()
        .configure_from_args()
        .sample_size(10)
        .output_directory(&std::env::temp_dir())
        .warm_up_time(near_zero)
        .measurement_time(near_zero)
        .without_plots();

    let mut selected = false;
    {
        // Both gags live until the end of this scope, i.e. for the whole probe run.
        let _stdout_gag = Gag::stdout().unwrap();
        let _stderr_gag = Gag::stderr().unwrap();

        probe
            .benchmark_group(bench_group)
            .bench_function(bench_id, |bencher| {
                // Only reached when the filter selects this benchmark.
                bencher.iter(|| {
                    selected = true;
                });
            });
    }

    selected
}
#[inline(never)]
pub fn bench_fhe_type_op<FheType, Op>(
c: &mut Criterion,

View File

@@ -15,7 +15,7 @@ use benchmark::params_aliases::{
#[cfg(feature = "gpu")]
use benchmark::utilities::configure_gpu;
use benchmark::utilities::{
get_bench_type, will_this_bench_run, write_to_json, BenchmarkType, BitSizesSet, EnvConfig,
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, BitSizesSet, EnvConfig,
OperatorType,
};
use criterion::{Criterion, Throughput};
@@ -51,7 +51,8 @@ fn bench_sns_only_fhe_type<FheType>(
type_name: &str,
num_bits: usize,
) where
FheType: FheEncrypt<u128, ClientKey> + Send + Sync + SquashNoise,
FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
FheType: SquashNoise,
{
let (param, noise_param, _, _) = params;
@@ -102,47 +103,13 @@ fn bench_sns_only_fhe_type<FheType>(
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
let elements = if will_this_bench_run(type_name, &bench_id) {
#[cfg(feature = "gpu")]
{
use benchmark::utilities::throughput_num_threads;
let params = client_key.computation_parameters();
let num_blocks = num_bits.div_ceil(
(params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize,
);
throughput_num_threads(num_blocks, 4)
}
#[cfg(not(any(feature = "gpu", feature = "hpu")))]
{
use benchmark::high_level_api::find_optimal_batch::find_optimal_batch;
let _ = num_bits; // Avoid clippy warning since FheType::num_bits() is not available.
let setup = |batch_size: usize| {
(0..batch_size)
.map(|_| FheType::encrypt(random(), &client_key))
.collect::<Vec<_>>()
};
let run = |inputs: &Vec<_>, batch_size: usize| {
inputs
.par_iter()
.take(batch_size)
.for_each(|input: &FheType| {
let _ = input.squash_noise();
});
};
find_optimal_batch(run, setup) as u64
}
} else {
0
};
let params = client_key.computation_parameters();
let num_blocks = num_bits
.div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
#[cfg(feature = "gpu")]
{
let elements = throughput_num_threads(num_blocks, 4);
bench_group.throughput(Throughput::Elements(elements));
println!("elements: {elements}");
let gpu_count = get_number_of_gpus() as usize;
@@ -176,6 +143,7 @@ fn bench_sns_only_fhe_type<FheType>(
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
{
let elements = throughput_num_threads(num_blocks, 1);
bench_group.throughput(Throughput::Elements(elements));
println!("elements: {elements}");
bench_group.bench_function(&bench_id, |b| {
@@ -285,51 +253,13 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
let elements = if will_this_bench_run(type_name, &bench_id) {
#[cfg(feature = "gpu")]
{
use benchmark::utilities::throughput_num_threads;
let params = client_key.computation_parameters();
let num_blocks = num_bits.div_ceil(
(params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize,
);
throughput_num_threads(num_blocks, 4)
}
#[cfg(not(any(feature = "gpu", feature = "hpu")))]
{
use benchmark::high_level_api::find_optimal_batch::find_optimal_batch;
let _ = num_bits; // Avoid clippy warning since FheType::num_bits() is not available.
// Noise squashing is the current bottleneck.
// Measuring CPU load with decompression and compression operations alongside
// the noise squash would just increase the batch size. Then benchmark execution
// duration would increase dramatically (from ~1.000 seconds to ~6.000 seconds).
let setup = |batch_size: usize| {
(0..batch_size)
.map(|_| FheType::encrypt(random(), &client_key))
.collect::<Vec<_>>()
};
let run = |inputs: &Vec<_>, batch_size: usize| {
inputs
.par_iter()
.take(batch_size)
.for_each(|input: &FheType| {
let _ = input.squash_noise();
});
};
find_optimal_batch(run, setup) as u64
}
} else {
0
};
let params = client_key.computation_parameters();
let num_blocks = num_bits
.div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
#[cfg(feature = "gpu")]
{
let elements = throughput_num_threads(num_blocks, 4);
bench_group.throughput(Throughput::Elements(elements));
println!("elements: {elements}");
let gpu_count = get_number_of_gpus() as usize;
@@ -376,6 +306,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
{
let elements = throughput_num_threads(num_blocks, 1);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let compressed_values = || {

View File

@@ -485,24 +485,6 @@ mod cuda {
use tfhe::integer::gpu::zk::CudaProvenCompactCiphertextList;
use tfhe::integer::gpu::CudaServerKey;
use tfhe::integer::CompressedServerKey;
use tfhe::GpuIndex;
/// Returns the per-GPU element count used by the GPU ZK throughput benchmarks.
///
/// The values are tuned to avoid running out of memory on H100 GPUs while still saturating
/// the GPU; memory usage grows with both the CRS size and the number of bits being proven.
///
/// * `crs_size` - CRS size in bits (64, 2048 and 4096 have dedicated entries).
/// * `bits` - number of bits being proven; only affects the 2048-bit CRS entry.
///
/// Any other `crs_size` falls back to 10.
fn gpu_zk_throughput_elements(crs_size: usize, bits: usize) -> u64 {
    match crs_size {
        // Smallest proofs: the most elements fit.
        64 => 30,
        // Moderate memory usage; fewer elements once more than 256 bits are proven.
        2048 if bits <= 256 => 15,
        2048 => 10,
        // Largest proofs, most memory intensive.
        4096 => 6,
        // Unknown configuration.
        _ => 10,
    }
}
fn gpu_pke_zk_verify(c: &mut Criterion, results_file: &Path) {
let bench_name = "integer::cuda::zk::pke_zk_verify";
@@ -704,8 +686,12 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
let elements = gpu_zk_throughput_elements(crs_size, *bits)
* get_number_of_gpus() as u64;
let mut elements_per_gpu = 100;
if *bits == 4096 {
elements_per_gpu /= 5;
}
// This value, found empirically, ensures saturation of 8XH100 SXM5
let elements = elements_per_gpu * get_number_of_gpus() as u64;
bench_group.throughput(Throughput::Elements(elements));
bench_id_verify = format!(
@@ -730,38 +716,15 @@ mod cuda {
.collect::<Vec<_>>();
let local_streams = cuda_local_streams(num_block, elements as usize);
let gpu_count = get_number_of_gpus() as usize;
let gpu_sks_vec: Vec<CudaServerKey> = (0..gpu_count)
.map(|gpu_idx| {
let stream =
CudaStreams::new_single_gpu(GpuIndex::new(gpu_idx as u32));
CudaServerKey::decompress_from_cpu(
&compressed_server_key,
&stream,
)
})
.collect();
let d_ksk_material_vec: Vec<CudaKeySwitchingKeyMaterial> = (0
..gpu_count)
.map(|gpu_idx| {
let stream =
CudaStreams::new_single_gpu(GpuIndex::new(gpu_idx as u32));
let d_ksk_material_vec = local_streams
.par_iter()
.map(|local_stream| {
CudaKeySwitchingKeyMaterial::from_key_switching_key(
&ksk, &stream,
&ksk,
local_stream,
)
})
.collect();
let d_ksks: Vec<CudaKeySwitchingKey> = (0..gpu_count)
.map(|gpu_idx| {
CudaKeySwitchingKey::from_cuda_key_switching_key_material(
&d_ksk_material_vec[gpu_idx],
&gpu_sks_vec[gpu_idx],
)
})
.collect();
.collect::<Vec<_>>();
bench_group.bench_function(&bench_id_verify, |b| {
b.iter(|| {
@@ -787,16 +750,17 @@ mod cuda {
|gpu_cts| {
gpu_cts.par_iter().enumerate().for_each
(|(i, gpu_ct)| {
let stream_idx = i % local_streams.len();
let local_stream = &local_streams[stream_idx];
let gpu_idx = i % gpu_count;
let d_ksk = &d_ksks[gpu_idx];
let local_stream = &local_streams[i % local_streams.len()];
let gpu_sk = CudaServerKey::decompress_from_cpu(&compressed_server_key, local_stream);
let d_ksk =
CudaKeySwitchingKey::from_cuda_key_switching_key_material(&d_ksk_material_vec[i % local_streams.len()], &gpu_sk);
gpu_ct
.expand_without_verification(d_ksk, local_stream)
.expand_without_verification(&d_ksk, local_stream)
.unwrap();
});
}, BatchSize::PerIteration);
}, BatchSize::SmallInput);
});
bench_group.bench_function(&bench_id_verify_and_expand, |b| {
@@ -814,18 +778,18 @@ mod cuda {
|gpu_cts| {
gpu_cts.par_iter().enumerate().for_each
(|(i, gpu_ct)| {
let stream_idx = i % local_streams.len();
let local_stream = &local_streams[stream_idx];
let gpu_idx = i % gpu_count;
let d_ksk = &d_ksks[gpu_idx];
let local_stream = &local_streams[i % local_streams.len()];
let gpu_sk = CudaServerKey::decompress_from_cpu(&compressed_server_key, local_stream);
let d_ksk =
CudaKeySwitchingKey::from_cuda_key_switching_key_material(&d_ksk_material_vec[i % local_streams.len()], &gpu_sk);
gpu_ct
.verify_and_expand(
&crs, &pk, &metadata, d_ksk, local_stream,
&crs, &pk, &metadata, &d_ksk, local_stream,
)
.unwrap();
});
}, BatchSize::PerIteration);
}, BatchSize::SmallInput);
});
}
}
@@ -852,154 +816,11 @@ mod cuda {
bench_group.finish()
}
/// Benchmarks generation of packed, proven compact ciphertext lists.
///
/// Registered under the Criterion group `zk::cuda::pke_zk_proof`. For each of the two
/// parameter sets below, each entry of `default_proof_config()`, each number of bits to
/// prove and each `ZkComputeLoad`, it times
/// `ProvenCompactCiphertextList::builder(..).build_with_proof_packed(..)`, either once per
/// iteration (latency) or over a batch processed with rayon (throughput), depending on
/// `get_bench_type()`. The benchmark id and parameters are then recorded with
/// `write_to_json`.
// NOTE(review): the proof is built through `tfhe::integer::ProvenCompactCiphertextList`;
// whether that call is GPU-accelerated is not visible from this function - confirm.
fn gpu_pke_zk_proof(c: &mut Criterion) {
let bench_name = "zk::cuda::pke_zk_proof";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
// (public-key encryption params, key-switching params, compute params) triples to bench.
let params: [(
CompactPublicKeyEncryptionParameters,
ShortintKeySwitchingParameters,
PBSParameters,
); 2] = [
(
PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
),
(
BENCH_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
),
];
for (param_pke, _param_ksk, param_fhe) in params.iter() {
let param_name = param_fhe.name();
let param_name = param_name.as_str();
let cks = ClientKey::new(*param_fhe);
let sks = ServerKey::new_radix_server_key(&cks);
let compact_private_key = CompactPrivateKey::new(*param_pke);
let pk = CompactPublicKey::new(&compact_private_key);
// Kept for consistency
// (the casting key is built but not used by the proof benchmark below)
let _casting_key =
KeySwitchingKey::new((&compact_private_key, None), (&cks, &sks), *_param_ksk);
// We have a use case with 320 bits of metadata
let mut metadata = [0u8; (320 / u8::BITS) as usize];
let mut rng = rand::thread_rng();
metadata.fill_with(|| rng.gen());
let zk_vers = param_pke.zk_scheme;
for proof_config in default_proof_config().iter() {
// Number of bits carried by one block (message and carry space together).
let msg_bits =
(param_pke.message_modulus.0 * param_pke.carry_modulus.0).ilog2() as usize;
println!("Generating CRS... ");
let crs_size = proof_config.crs_size;
// The CRS is sized in LWE ciphertexts: CRS bits divided by bits per block.
let crs = CompactPkeCrs::from_shortint_params(
*param_pke,
LweCiphertextCount(crs_size / msg_bits),
)
.unwrap();
for bits in proof_config.bits_to_prove.iter() {
// Inputs are 64-bit messages, so the bit count must be a whole number of them.
assert_eq!(bits % 64, 0);
// Packing, so we take the message and carry modulus to compute our block count
let num_block = 64usize.div_ceil(msg_bits);
let fhe_uint_count = bits / 64;
for compute_load in [ZkComputeLoad::Proof, ZkComputeLoad::Verify] {
// Label used in the benchmark id for the selected compute load.
let zk_load = match compute_load {
ZkComputeLoad::Proof => "compute_load_proof",
ZkComputeLoad::Verify => "compute_load_verify",
};
// Assigned in exactly one arm of the match below.
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
bench_id = format!(
"{bench_name}::{param_name}_{bits}_bits_packed_{crs_size}_bits_crs_{zk_load}_ZK{zk_vers:?}"
);
bench_group.bench_function(&bench_id, |b| {
// One random message repeated `fhe_uint_count` times, built outside the timed loop.
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
b.iter(|| {
let _ct1 =
tfhe::integer::ProvenCompactCiphertextList::builder(
&pk,
)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap();
})
});
}
BenchmarkType::Throughput => {
// The zk proof is currently not pooled, so we simply use the number
// of threads as heuristic for the
// batch size
let elements =
(rayon::current_num_threads() / num_block).max(1) + 1;
bench_group.throughput(Throughput::Elements(elements as u64));
bench_id = format!(
"{bench_name}::throughput::{param_name}_{bits}_bits_packed_{crs_size}_bits_crs_{zk_load}_ZK{zk_vers:?}"
);
bench_group.bench_function(&bench_id, |b| {
// One message vector per batch element, built outside the timed loop.
let messages = (0..elements)
.map(|_| {
let input_msg = rng.gen::<u64>();
vec![input_msg; fhe_uint_count]
})
.collect::<Vec<_>>();
b.iter(|| {
// Each batch element is proven on the rayon thread pool.
messages.par_iter().for_each(|msg| {
tfhe::integer::ProvenCompactCiphertextList::builder(
&pk,
)
.extend(msg.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap();
})
})
});
}
}
// Record the benchmark id and parameters for this configuration.
let shortint_params: PBSParameters = *param_fhe;
write_to_json::<u64, _>(
&bench_id,
shortint_params,
param_name,
"pke_zk_proof",
&OperatorType::Atomic,
shortint_params.message_modulus().0 as u32,
vec![shortint_params.message_modulus().0.ilog2(); num_block],
);
}
}
}
}
}
/// Runs `gpu_pke_zk_verify` with a Criterion instance configured from the command-line
/// arguments, passing `gpu_pke_zk_crs_sizes.csv` as the results file.
pub fn gpu_zk_verify() {
    let mut bench_runner: Criterion<_> = Criterion::default().configure_from_args();
    let crs_sizes_file = Path::new("gpu_pke_zk_crs_sizes.csv");

    gpu_pke_zk_verify(&mut bench_runner, crs_sizes_file);
}
/// Runs `gpu_pke_zk_proof` with a Criterion instance configured from the command-line
/// arguments.
pub fn gpu_zk_proof() {
    let mut bench_runner: Criterion<_> = Criterion::default().configure_from_args();

    gpu_pke_zk_proof(&mut bench_runner);
}
}
pub fn zk_verify_and_proof() {
@@ -1010,14 +831,11 @@ pub fn zk_verify_and_proof() {
}
#[cfg(all(feature = "gpu", feature = "zk-pok"))]
use crate::cuda::{gpu_zk_proof, gpu_zk_verify};
use crate::cuda::gpu_zk_verify;
fn main() {
#[cfg(all(feature = "gpu", feature = "zk-pok"))]
{
gpu_zk_proof();
gpu_zk_verify();
}
gpu_zk_verify();
#[cfg(not(feature = "gpu"))]
zk_verify_and_proof();

View File

@@ -1,406 +0,0 @@
//! Benchmark comparing CPU MSM vs GPU MSM for BLS12-446
//!
//! This benchmark measures the performance of multi-scalar multiplication (MSM)
//! for both G1 and G2 points on the BLS12-446 curve.
//!
//! CPU benchmarks use the arkworks-based `G1Affine::multi_mul_scalar` /
//! `G2Affine::multi_mul_scalar`. GPU benchmarks (gated behind the
//! `gpu-experimental-zk` feature) call `tfhe_zk_pok::gpu::g1_msm_gpu` /
//! `tfhe_zk_pok::gpu::g2_msm_gpu` directly, which dispatch to the
//! zk-cuda-backend.
//!
//! ## Running the benchmarks
//!
//! ```bash
//! # CPU only
//! cargo bench --package tfhe-benchmark --bench zk-msm
//!
//! # CPU and GPU
//! cargo bench --package tfhe-benchmark --bench zk-msm --features gpu-experimental-zk
//! ```
use benchmark::utilities::{
get_bench_type, write_to_json, BenchmarkType, CryptoParametersRecord, OperatorType,
};
use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion, Throughput};
use rand::rngs::StdRng;
use rand::SeedableRng;
use rayon::prelude::*;
use std::time::Duration;
use tfhe_zk_pok::curve_api::bls12_446::{G1Affine, G2Affine, Zp, G1, G2};
use tfhe_zk_pok::curve_api::CurveGroupOps;
/// Returns how many MSM instances are run in parallel by the throughput benchmarks.
///
/// Larger inputs get a smaller batch so that setup time stays reasonable while throughput
/// is still exercised aggressively:
///
/// * up to 1000 points: 64
/// * up to 4096 points: 32
/// * anything larger: 16
fn msm_throughput_elements(input_size: usize) -> u64 {
    if input_size <= 1000 {
        64
    } else if input_size <= 4096 {
        32
    } else {
        16
    }
}
/// Builds `n` random G1 affine points with tfhe-zk-pok.
///
/// Each point is the G1 generator multiplied by a scalar drawn from `rng`, then normalized.
fn generate_g1_affine_points(rng: &mut StdRng, n: usize) -> Vec<G1Affine> {
    let mut points = Vec::with_capacity(n);
    for _ in 0..n {
        let scalar = Zp::rand(rng);
        points.push(G1::GENERATOR.mul_scalar(scalar).normalize());
    }
    points
}
/// Builds `n` random G2 affine points with tfhe-zk-pok.
///
/// Each point is the G2 generator multiplied by a scalar drawn from `rng`, then normalized.
fn generate_g2_affine_points(rng: &mut StdRng, n: usize) -> Vec<G2Affine> {
    let mut points = Vec::with_capacity(n);
    for _ in 0..n {
        let scalar = Zp::rand(rng);
        points.push(G2::GENERATOR.mul_scalar(scalar).normalize());
    }
    points
}
/// Draws `n` random `Zp` scalars from `rng` with tfhe-zk-pok.
fn generate_scalars(rng: &mut StdRng, n: usize) -> Vec<Zp> {
    std::iter::repeat_with(|| Zp::rand(rng)).take(n).collect()
}
/// Benchmark CPU MSM for G1 points using tfhe-zk-pok entry points
fn bench_cpu_g1_msm(c: &mut Criterion) {
let curve_name = "bls12_446";
let subgroup_name = "G1";
let bench_name = format!("zk::msm::{curve_name}::{subgroup_name}");
let mut group = c.benchmark_group(&bench_name);
group.sample_size(10);
group.measurement_time(Duration::from_secs(30));
for size in [100, 1000, 2048, 4096, 10000].iter() {
let n = *size;
let bench_id;
let bench_shortname = "zk::msm::bls12_446::g1";
match get_bench_type() {
BenchmarkType::Latency => {
let mut rng = StdRng::seed_from_u64(42);
let bases = generate_g1_affine_points(&mut rng, n);
let scalars = generate_scalars(&mut rng, n);
bench_id = format!("{bench_name}::{n}");
group.bench_with_input(&bench_id, &n, |b, _| {
b.iter(|| {
let result =
G1Affine::multi_mul_scalar(black_box(&bases), black_box(&scalars));
black_box(result)
});
});
}
BenchmarkType::Throughput => {
let elements = msm_throughput_elements(n);
group.throughput(Throughput::Elements(elements));
bench_id = format!("{bench_name}::throughput::{n}");
group.bench_with_input(&bench_id, &n, |b, _| {
// Setup generates test data in parallel, excluded from measurement
let setup = || {
(0..elements)
.into_par_iter()
.map(|i| {
let mut rng = StdRng::seed_from_u64(42 + i);
let bases = generate_g1_affine_points(&mut rng, n);
let scalars = generate_scalars(&mut rng, n);
(bases, scalars)
})
.collect::<Vec<_>>()
};
b.iter_batched(
setup,
|test_data| {
test_data.par_iter().for_each(|(bases, scalars)| {
let result = G1Affine::multi_mul_scalar(
black_box(bases),
black_box(scalars),
);
black_box(result);
});
},
BatchSize::LargeInput,
);
});
}
}
// MSM benchmarks are curve operations, use minimal parameters
let params: CryptoParametersRecord<u64> = CryptoParametersRecord::default();
write_to_json(
&bench_id,
params,
"MSM_BLS12_446_G1",
bench_shortname,
&OperatorType::Atomic,
64, // bit_size for curve scalar operations
vec![], // decomposition_basis not applicable for MSM
);
}
group.finish();
}
/// Benchmark CPU MSM for G2 points using tfhe-zk-pok entry points
fn bench_cpu_g2_msm(c: &mut Criterion) {
let curve_name = "bls12_446";
let subgroup_name = "G2";
let bench_name = format!("zk::msm::{curve_name}::{subgroup_name}");
let mut group = c.benchmark_group(&bench_name);
group.sample_size(10);
group.measurement_time(Duration::from_secs(30));
for size in [100, 1000, 2048, 4096, 10000].iter() {
let n = *size;
let bench_id;
let bench_shortname = "zk::msm::bls12_446::g2";
match get_bench_type() {
BenchmarkType::Latency => {
let mut rng = StdRng::seed_from_u64(42);
let bases = generate_g2_affine_points(&mut rng, n);
let scalars = generate_scalars(&mut rng, n);
bench_id = format!("{bench_name}::{n}");
group.bench_with_input(&bench_id, &n, |b, _| {
b.iter(|| {
let result =
G2Affine::multi_mul_scalar(black_box(&bases), black_box(&scalars));
black_box(result)
});
});
}
BenchmarkType::Throughput => {
let elements = msm_throughput_elements(n);
group.throughput(Throughput::Elements(elements));
bench_id = format!("{bench_name}::throughput::{n}");
group.bench_with_input(&bench_id, &n, |b, _| {
// Setup generates test data in parallel, excluded from measurement
let setup = || {
(0..elements)
.into_par_iter()
.map(|i| {
let mut rng = StdRng::seed_from_u64(42 + i);
let bases = generate_g2_affine_points(&mut rng, n);
let scalars = generate_scalars(&mut rng, n);
(bases, scalars)
})
.collect::<Vec<_>>()
};
b.iter_batched(
setup,
|test_data| {
test_data.par_iter().for_each(|(bases, scalars)| {
let result = G2Affine::multi_mul_scalar(
black_box(bases),
black_box(scalars),
);
black_box(result);
});
},
BatchSize::LargeInput,
);
});
}
}
// MSM benchmarks are curve operations, use minimal parameters
let params: CryptoParametersRecord<u64> = CryptoParametersRecord::default();
write_to_json(
&bench_id,
params,
"MSM_BLS12_446_G2",
bench_shortname,
&OperatorType::Atomic,
64, // bit_size for curve scalar operations
vec![], // decomposition_basis not applicable for MSM
);
}
group.finish();
}
/// Criterion benchmark of GPU multi-scalar multiplication on G1 points through
/// `tfhe_zk_pok::gpu::g1_msm_gpu`.
///
/// Only compiled with the `gpu-experimental-zk` feature. For each input size the benchmark
/// runs in latency mode (a single MSM per iteration) or in throughput mode (a batch of MSMs
/// submitted from rayon worker threads), depending on `get_bench_type()`, and then records
/// the benchmark id through `write_to_json`.
#[cfg(feature = "gpu-experimental-zk")]
fn bench_gpu_g1_msm(c: &mut Criterion) {
    use tfhe_zk_pok::gpu::{g1_msm_gpu, select_gpu_for_msm};

    let curve_name = "bls12_446";
    let subgroup_name = "G1";
    let bench_name = format!("zk::cuda::msm::{curve_name}::{subgroup_name}");
    let bench_shortname = "zk::cuda::msm::bls12_446::g1";

    let mut group = c.benchmark_group(&bench_name);
    group.sample_size(10);
    group.measurement_time(Duration::from_secs(30));

    // Resolve GPU index once — stream creation/destruction is handled inside g1_msm_gpu
    let gpu_index = select_gpu_for_msm();

    for n in [100usize, 1000, 2048, 4096, 10000] {
        let bench_id = match get_bench_type() {
            BenchmarkType::Latency => {
                let id = format!("{bench_name}::{n}");

                // Fixed seed: the same inputs are used on every run.
                let mut rng = StdRng::seed_from_u64(42);
                let bases = generate_g1_affine_points(&mut rng, n);
                let scalars = generate_scalars(&mut rng, n);

                group.bench_with_input(&id, &n, |b, _| {
                    b.iter(|| {
                        black_box(g1_msm_gpu(
                            black_box(&bases),
                            black_box(&scalars),
                            gpu_index,
                        ))
                    });
                });
                id
            }
            BenchmarkType::Throughput => {
                let batch = msm_throughput_elements(n);
                group.throughput(Throughput::Elements(batch));
                let id = format!("{bench_name}::throughput::{n}");

                group.bench_with_input(&id, &n, |b, _| {
                    // Inputs are produced by the setup closure handed to `iter_batched`.
                    let make_inputs = || {
                        (0..batch)
                            .into_par_iter()
                            .map(|i| {
                                let mut rng = StdRng::seed_from_u64(42 + i);
                                let bases = generate_g1_affine_points(&mut rng, n);
                                let scalars = generate_scalars(&mut rng, n);
                                (bases, scalars)
                            })
                            .collect::<Vec<_>>()
                    };
                    let run_batch = |inputs: Vec<_>| {
                        inputs.par_iter().for_each(|(bases, scalars)| {
                            let result =
                                g1_msm_gpu(black_box(bases), black_box(scalars), gpu_index);
                            black_box(result);
                        });
                    };
                    b.iter_batched(make_inputs, run_batch, BatchSize::LargeInput);
                });
                id
            }
        };

        write_to_json(
            &bench_id,
            CryptoParametersRecord::<u64>::default(),
            "MSM_BLS12_446_G1_CUDA",
            bench_shortname,
            &OperatorType::Atomic,
            64,     // bit_size for curve scalar operations
            vec![], // decomposition_basis not applicable for MSM
        );
    }

    group.finish();
}
/// Criterion benchmark of GPU multi-scalar multiplication on G2 points through
/// `tfhe_zk_pok::gpu::g2_msm_gpu`.
///
/// Only compiled with the `gpu-experimental-zk` feature. For each input size the benchmark
/// runs in latency mode (a single MSM per iteration) or in throughput mode (a batch of MSMs
/// submitted from rayon worker threads), depending on `get_bench_type()`, and then records
/// the benchmark id through `write_to_json`.
#[cfg(feature = "gpu-experimental-zk")]
fn bench_gpu_g2_msm(c: &mut Criterion) {
    use tfhe_zk_pok::gpu::{g2_msm_gpu, select_gpu_for_msm};

    let curve_name = "bls12_446";
    let subgroup_name = "G2";
    let bench_name = format!("zk::cuda::msm::{curve_name}::{subgroup_name}");
    let bench_shortname = "zk::cuda::msm::bls12_446::g2";

    let mut group = c.benchmark_group(&bench_name);
    group.sample_size(10);
    group.measurement_time(Duration::from_secs(30));

    // The GPU index is resolved once and reused for every MSM call below.
    let gpu_index = select_gpu_for_msm();

    for n in [100usize, 1000, 2048, 4096, 10000] {
        let bench_id = match get_bench_type() {
            BenchmarkType::Latency => {
                let id = format!("{bench_name}::{n}");

                // Fixed seed: the same inputs are used on every run.
                let mut rng = StdRng::seed_from_u64(42);
                let bases = generate_g2_affine_points(&mut rng, n);
                let scalars = generate_scalars(&mut rng, n);

                group.bench_with_input(&id, &n, |b, _| {
                    b.iter(|| {
                        black_box(g2_msm_gpu(
                            black_box(&bases),
                            black_box(&scalars),
                            gpu_index,
                        ))
                    });
                });
                id
            }
            BenchmarkType::Throughput => {
                let batch = msm_throughput_elements(n);
                group.throughput(Throughput::Elements(batch));
                let id = format!("{bench_name}::throughput::{n}");

                group.bench_with_input(&id, &n, |b, _| {
                    // Inputs are produced by the setup closure handed to `iter_batched`.
                    let make_inputs = || {
                        (0..batch)
                            .into_par_iter()
                            .map(|i| {
                                let mut rng = StdRng::seed_from_u64(42 + i);
                                let bases = generate_g2_affine_points(&mut rng, n);
                                let scalars = generate_scalars(&mut rng, n);
                                (bases, scalars)
                            })
                            .collect::<Vec<_>>()
                    };
                    let run_batch = |inputs: Vec<_>| {
                        inputs.par_iter().for_each(|(bases, scalars)| {
                            let result =
                                g2_msm_gpu(black_box(bases), black_box(scalars), gpu_index);
                            black_box(result);
                        });
                    };
                    b.iter_batched(make_inputs, run_batch, BatchSize::LargeInput);
                });
                id
            }
        };

        write_to_json(
            &bench_id,
            CryptoParametersRecord::<u64>::default(),
            "MSM_BLS12_446_G2_CUDA",
            bench_shortname,
            &OperatorType::Atomic,
            64,     // bit_size for curve scalar operations
            vec![], // decomposition_basis not applicable for MSM
        );
    }

    group.finish();
}
// CPU benchmarks (always available)
criterion_group!(benches_cpu, bench_cpu_g1_msm, bench_cpu_g2_msm,);
// GPU benchmarks (only when GPU feature is enabled)
#[cfg(feature = "gpu-experimental-zk")]
criterion_group!(benches_gpu, bench_gpu_g1_msm, bench_gpu_g2_msm,);
// Conditionally include GPU benchmarks in main
#[cfg(feature = "gpu-experimental-zk")]
criterion_main!(benches_cpu, benches_gpu);
#[cfg(not(feature = "gpu-experimental-zk"))]
criterion_main!(benches_cpu);

View File

@@ -1,4 +1,3 @@
use criterion::Criterion;
use serde::Serialize;
use std::path::PathBuf;
use std::sync::OnceLock;
@@ -549,34 +548,6 @@ where
factor as usize
}
/// Tells whether the benchmark `bench_id` of group `bench_group` would be executed under the
/// current Criterion filter, so that callers can avoid running a costly setup for nothing.
///
/// A minimal Criterion configuration (built from the command-line arguments, with
/// near-zero warm-up and measurement times and no plots) runs a no-op benchmark under the
/// given group and id; the benchmark body only sets a flag.
/// `Gag` is used here to suppress the temporary output noise from Criterion.
///
/// Returns `true` if the benchmark body was executed, `false` otherwise.
///
/// # Panics
///
/// Panics if stdout or stderr cannot be gagged.
pub fn will_this_bench_run(bench_group: &str, bench_id: &str) -> bool {
let mut c = Criterion::default()
.configure_from_args()
.sample_size(10)
.output_directory(&std::env::temp_dir())
.warm_up_time(std::time::Duration::from_nanos(1))
.measurement_time(std::time::Duration::from_nanos(1))
.without_plots();
let mut will_run = false;
{
use gag::Gag;
// Both gags are held until the end of this block, covering the whole probe run.
let _print_gag = Gag::stdout().unwrap();
let _err_gag = Gag::stderr().unwrap();
c.benchmark_group(bench_group)
.bench_function(bench_id, |b| {
b.iter(|| {
// Reached only if this benchmark is actually run.
will_run = true;
});
});
}
will_run
}
#[cfg(feature = "gpu")]
mod cuda_utils {
use tfhe::core_crypto::entities::{

View File

@@ -4,8 +4,8 @@ version = "0.8.0"
edition = "2021"
license = "BSD-3-Clause-Clear"
description = "Cryptographically Secure PRNG used in the TFHE-rs library."
homepage = "https://zama.org/"
documentation = "https://docs.zama.org/tfhe-rs"
homepage = "https://zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
readme = "README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

View File

@@ -20,4 +20,4 @@ RUSTFLAGS="-Ctarget-cpu=native" cargo bench
## License
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
please contact us at `hello@zama.org`.
please contact us at `hello@zama.ai`.

View File

@@ -6,7 +6,7 @@ description = "tfhe-fft is a pure Rust high performance fast Fourier transform l
readme = "README.md"
repository = "https://github.com/zama-ai/tfhe-rs"
license = "BSD-3-Clause-Clear"
homepage = "https://zama.org/"
homepage = "https://zama.ai/"
keywords = ["fft"]
[dependencies]

View File

@@ -1,6 +1,6 @@
tfhe-fft is a pure Rust high performance fast Fourier transform library
that processes vectors of sizes that are powers of two. It was made to be used
as a backend in Zama's [TFHE-rs](https://docs.zama.org/tfhe-rs) library.
as a backend in Zama's [TFHE-rs](https://docs.zama.ai/tfhe-rs) library.
This library provides two FFT modules:
- The ordered module FFT applies a forward/inverse FFT that takes its input in standard
@@ -69,7 +69,7 @@ fn main() {
## Links
- [Zama](https://www.zama.org/)
- [Zama](https://www.zama.ai/)
- [TFHE-rs Sources](https://github.com/zama-ai/tfhe-rs)
## License
@@ -81,4 +81,4 @@ prototyping purposes, as well as for your personal projects.
If you want to use tfhe-fft in a commercial product however, you will need to
purchase a separate commercial licence.
If you have any questions, please contact us at `hello@zama.org`.
If you have any questions, please contact us at `hello@zama.ai`.

View File

@@ -6,7 +6,7 @@ description = "tfhe-ntt is a pure Rust high performance number theoretic transfo
readme = "README.md"
repository = "https://github.com/zama-ai/tfhe-rs"
license = "BSD-3-Clause-Clear"
homepage = "https://zama.org/"
homepage = "https://zama.ai/"
keywords = ["ntt"]
rust-version.workspace = true

View File

@@ -3,8 +3,8 @@ name = "tfhe-zk-pok"
version = "0.8.0"
edition = "2021"
keywords = ["zero", "knowledge", "proof", "vector-commitments"]
homepage = "https://zama.org/"
documentation = "https://docs.zama.org/tfhe-rs"
homepage = "https://zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
license = "BSD-3-Clause-Clear"
description = "tfhe-zk-pok: An implementation of zero-knowledge proofs of encryption for TFHE."
@@ -14,8 +14,8 @@ rust-version.workspace = true
[dependencies]
ark-bls12-381 = "0.5.0"
ark-ec = { workspace = true, features = ["parallel"] }
ark-ff = { workspace = true, features = ["parallel"] }
ark-ec = { version = "0.5.0", features = ["parallel"] }
ark-ff = { version = "0.5.0", features = ["parallel"] }
ark-poly = { version = "0.5.0", features = ["parallel"] }
rand = { workspace = true }
rayon = { workspace = true }
@@ -24,13 +24,9 @@ serde = { workspace = true, features = ["default", "derive"] }
zeroize = "1.7.0"
num-bigint = "0.4.5"
tfhe-versionable = { version = "0.7.0", path = "../utils/tfhe-versionable" }
zk-cuda-backend = { version = "0.1.0", path = "../backends/zk-cuda-backend", optional = true }
tfhe-cuda-backend = { version = "0.13.0", path = "../backends/tfhe-cuda-backend", optional = true }
itertools.workspace = true
[features]
experimental = []
gpu-experimental = ["dep:zk-cuda-backend", "dep:tfhe-cuda-backend"]
[dev-dependencies]
serde_json = "~1.0"

View File

@@ -91,110 +91,5 @@ fn bench_pke_v1_verify(c: &mut Criterion) {
}
}
/// Criterion benchmarks that exercise the `pke::gpu` prove/verify entry points.
///
/// Only compiled when the `gpu-experimental` feature is enabled.
#[cfg(feature = "gpu-experimental")]
mod gpu {
    use super::*;
    use tfhe_zk_pok::proofs::pke;

    /// Benchmarks `pke::gpu::prove` for every parameter set and every `ComputeLoad`.
    ///
    /// One benchmark id is registered per `(params, load)` pair inside the
    /// `tfhe_zk_pok::cuda::pke_zk_proof_v1` group, and `write_to_json` is called
    /// for each id right after it has been registered.
    pub fn bench_pke_v1_prove_gpu(c: &mut Criterion) {
        let bench_shortname = "pke_zk_proof_v1";
        let bench_name = format!("tfhe_zk_pok::cuda::{bench_shortname}");

        let mut group = c.benchmark_group(&bench_name);
        group.sample_size(15);
        group.measurement_time(std::time::Duration::from_secs(60));

        let mut rng = rand::thread_rng();

        let param_sets = [
            (PKEV1_TEST_PARAMS, "PKEV1_TEST_PARAMS"),
            (PKEV2_TEST_PARAMS, "PKEV2_TEST_PARAMS"),
        ];
        for (params, param_name) in param_sets {
            // Setup runs once per parameter set, outside of the measured closure.
            let (pub_params, pub_commit, priv_commit, metadata) = init_params_v1(params);

            // Only used to label the benchmark id.
            let bits = (params.k as u32) * (params.t >> 1).ilog2();

            for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
                let bench_id = format!("{bench_name}::{param_name}_{bits}_bits_packed_{load}");
                // One random seed per benchmark id; every iteration of that id reuses it.
                let seed: u128 = rng.gen();

                group.bench_function(&bench_id, |bencher| {
                    bencher.iter(|| {
                        pke::gpu::prove(
                            (&pub_params, &pub_commit),
                            &priv_commit,
                            &metadata,
                            load,
                            &seed.to_le_bytes(),
                        )
                    })
                });

                write_to_json(&bench_id, params, param_name, bench_shortname);
            }
        }
    }

    /// Benchmarks `pke::gpu::verify` for every parameter set and every `ComputeLoad`.
    ///
    /// The proof being verified is produced once per benchmark id with
    /// `pke::gpu::prove`, outside of the measured closure. Verification failure
    /// panics through `unwrap`.
    pub fn bench_pke_v1_verify_gpu(c: &mut Criterion) {
        let bench_shortname = "pke_zk_verify_v1";
        let bench_name = format!("tfhe_zk_pok::cuda::{bench_shortname}");

        let mut group = c.benchmark_group(&bench_name);
        group.sample_size(15);
        group.measurement_time(std::time::Duration::from_secs(60));

        let mut rng = rand::thread_rng();

        let param_sets = [
            (PKEV1_TEST_PARAMS, "PKEV1_TEST_PARAMS"),
            (PKEV2_TEST_PARAMS, "PKEV2_TEST_PARAMS"),
        ];
        for (params, param_name) in param_sets {
            let (pub_params, pub_commit, priv_commit, metadata) = init_params_v1(params);

            // Only used to label the benchmark id.
            let bits = (params.k as u32) * (params.t >> 1).ilog2();

            for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
                let bench_id = format!("{bench_name}::{param_name}_{bits}_bits_packed_{load}");
                let seed: u128 = rng.gen();

                // Use GPU prove to generate the proof that the measured closure verifies.
                let gpu_proof = pke::gpu::prove(
                    (&pub_params, &pub_commit),
                    &priv_commit,
                    &metadata,
                    load,
                    &seed.to_le_bytes(),
                );

                group.bench_function(&bench_id, |bencher| {
                    bencher.iter(|| {
                        pke::gpu::verify(&gpu_proof, (&pub_params, &pub_commit), &metadata)
                            .unwrap();
                    })
                });

                write_to_json(&bench_id, params, param_name, bench_shortname);
            }
        }
    }
}
// Benchmark group that is registered regardless of the `gpu-experimental` feature.
criterion_group!(benches_pke_v1, bench_pke_v1_verify, bench_pke_v1_prove);
// The import and the extra group below only exist when `gpu-experimental` is enabled.
#[cfg(feature = "gpu-experimental")]
use gpu::{bench_pke_v1_prove_gpu, bench_pke_v1_verify_gpu};
#[cfg(feature = "gpu-experimental")]
criterion_group!(
benches_pke_v1_gpu,
bench_pke_v1_verify_gpu,
bench_pke_v1_prove_gpu
);
// Exactly one `criterion_main!` is compiled: with the feature it lists both groups,
// without it only `benches_pke_v1`.
#[cfg(feature = "gpu-experimental")]
criterion_main!(benches_pke_v1, benches_pke_v1_gpu);
#[cfg(not(feature = "gpu-experimental"))]
criterion_main!(benches_pke_v1);

View File

@@ -107,130 +107,5 @@ fn bench_pke_v2_verify(c: &mut Criterion) {
}
}
/// Criterion benchmarks that exercise the `pke_v2::gpu` prove/verify entry points.
///
/// Only compiled when the `gpu-experimental` feature is enabled.
#[cfg(feature = "gpu-experimental")]
mod gpu {
    use super::*;
    use tfhe_zk_pok::proofs::pke_v2;

    /// Benchmarks `pke_v2::gpu::prove` over the cartesian product of parameter
    /// sets, `ComputeLoad` values and `Bound` values.
    ///
    /// Each combination gets its own benchmark id in the
    /// `tfhe_zk_pok::cuda::pke_zk_proof_v2` group; the id is printed to stdout
    /// and passed to `write_to_json` after registration.
    pub fn bench_pke_v2_prove_gpu(c: &mut Criterion) {
        let bench_shortname = "pke_zk_proof_v2";
        let bench_name = format!("tfhe_zk_pok::cuda::{bench_shortname}");

        let mut group = c.benchmark_group(&bench_name);
        group.sample_size(15);
        group.measurement_time(std::time::Duration::from_secs(60));

        let mut rng = rand::thread_rng();

        let param_sets = [
            (PKEV1_TEST_PARAMS, "PKEV1_TEST_PARAMS"),
            (PKEV2_TEST_PARAMS, "PKEV2_TEST_PARAMS"),
        ];
        let loads = [ComputeLoad::Proof, ComputeLoad::Verify];
        let bounds = [Bound::CS, Bound::GHL];

        for ((params, param_name), load, bound) in itertools::iproduct!(param_sets, loads, bounds)
        {
            // Setup is redone for every combination, outside of the measured closure.
            let (pub_params, pub_commit, priv_commit, metadata) = init_params_v2(params, bound);

            // Only used to label the benchmark id.
            let bits = (params.k as u32) * (params.t >> 1).ilog2();

            let bench_id =
                format!("{bench_name}::{param_name}_{bits}_bits_packed_{load}_{bound:?}");
            println!("{bench_id}");

            // One random seed per benchmark id; every iteration of that id reuses it.
            let seed: u128 = rng.gen();

            group.bench_function(&bench_id, |bencher| {
                bencher.iter(|| {
                    pke_v2::gpu::prove(
                        (&pub_params, &pub_commit),
                        &priv_commit,
                        &metadata,
                        load,
                        &seed.to_le_bytes(),
                    )
                })
            });

            write_to_json(&bench_id, params, param_name, bench_shortname);
        }
    }

    /// Benchmarks `pke_v2::gpu::verify` over the cartesian product of parameter
    /// sets, `ComputeLoad` values, `Bound` values and `VerificationPairingMode`
    /// values.
    ///
    /// The proof being verified is produced once per benchmark id with
    /// `pke_v2::gpu::prove`, outside of the measured closure. Verification
    /// failure panics through `unwrap`.
    pub fn bench_pke_v2_verify_gpu(c: &mut Criterion) {
        let bench_shortname = "pke_zk_verify_v2";
        let bench_name = format!("tfhe_zk_pok::cuda::{bench_shortname}");

        let mut group = c.benchmark_group(&bench_name);
        group.sample_size(15);
        group.measurement_time(std::time::Duration::from_secs(60));

        let mut rng = rand::thread_rng();

        let param_sets = [
            (PKEV1_TEST_PARAMS, "PKEV1_TEST_PARAMS"),
            (PKEV2_TEST_PARAMS, "PKEV2_TEST_PARAMS"),
        ];
        let loads = [ComputeLoad::Proof, ComputeLoad::Verify];
        let bounds = [Bound::CS, Bound::GHL];
        let pairing_modes = [
            VerificationPairingMode::TwoSteps,
            VerificationPairingMode::Batched,
        ];

        for ((params, param_name), load, bound, pairing_mode) in
            itertools::iproduct!(param_sets, loads, bounds, pairing_modes)
        {
            let (pub_params, pub_commit, priv_commit, metadata) = init_params_v2(params, bound);

            // Only used to label the benchmark id.
            let bits = (params.k as u32) * (params.t >> 1).ilog2();

            let bench_id = format!(
                "{bench_name}::{param_name}_{bits}_bits_packed_{load}_{bound:?}_{pairing_mode:?}"
            );
            println!("{bench_id}");

            let seed: u128 = rng.gen();

            // Use GPU prove to generate the proof that the measured closure verifies.
            let gpu_proof = pke_v2::gpu::prove(
                (&pub_params, &pub_commit),
                &priv_commit,
                &metadata,
                load,
                &seed.to_le_bytes(),
            );

            group.bench_function(&bench_id, |bencher| {
                bencher.iter(|| {
                    pke_v2::gpu::verify(
                        &gpu_proof,
                        (&pub_params, &pub_commit),
                        &metadata,
                        pairing_mode,
                    )
                    .unwrap();
                })
            });

            write_to_json(&bench_id, params, param_name, bench_shortname);
        }
    }
}
// Benchmark group that is registered regardless of the `gpu-experimental` feature.
criterion_group!(benches_pke_v2, bench_pke_v2_verify, bench_pke_v2_prove);
// The import and the extra group below only exist when `gpu-experimental` is enabled.
#[cfg(feature = "gpu-experimental")]
use gpu::{bench_pke_v2_prove_gpu, bench_pke_v2_verify_gpu};
#[cfg(feature = "gpu-experimental")]
criterion_group!(
benches_pke_v2_gpu,
bench_pke_v2_verify_gpu,
bench_pke_v2_prove_gpu
);
// Exactly one `criterion_main!` is compiled: with the feature it lists both groups,
// without it only `benches_pke_v2`.
#[cfg(feature = "gpu-experimental")]
criterion_main!(benches_pke_v2, benches_pke_v2_gpu);
#[cfg(not(feature = "gpu-experimental"))]
criterion_main!(benches_pke_v2);

Some files were not shown because too many files have changed in this diff Show More