Compare commits

..

3 Commits

Author SHA1 Message Date
bbarbakadze
bf4b3a2b83 feat: use signed digits to reduce bucket size 2026-04-16 19:38:50 +04:00
bbarbakadze
b04fb4b94b feat: implement extended Jacobian 2026-04-16 17:47:51 +04:00
bbarbakadze
e716051049 feat(gpu): optimize BLS12-446 field arithmetic for MSM performance
- Replace 64-bit CIOS Montgomery multiplication with 32-bit MAD chains
    (mad.lo.cc/madc.hi.cc), exploiting native 2x throughput of 32-bit ops
    on NVIDIA GPUs via even/odd accumulator separation

  - Add fp_mont_sqr using a triangular MAD chain (upper triangle computed
    once and doubled, diagonal added separately), saving nearly half of the
    multiplications versus treating squaring as a general multiplication

  - Add fp_add_lazy/fp_sub_lazy (and Fp2 variants): skip the final
    conditional subtraction when the result feeds fp_mont_mul, which
    accepts inputs in [0, 2p). Wired into fp2_mont_mul, fp2_mont_square,
    and G1/G2 projective_point_double

  - Replace all fp_mont_mul(c, a, a) squaring patterns with fp_mont_sqr
    across curve.cu and fp2.cu (point addition, doubling, inversion)
2026-04-15 15:24:52 +04:00
265 changed files with 2812 additions and 16229 deletions

View File

@@ -4,6 +4,9 @@ ignore = [
"RUSTSEC-2024-0436",
# Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex in the short term.
"RUSTSEC-2025-0141",
# Ignoring unsoundness in 'rand' with custom logger. Rand update is currently blocked by
# arkworks and we do not use custom loggers.
"RUSTSEC-2026-0097",
]
[output]

View File

@@ -54,7 +54,7 @@ jobs:
- name: Retrieve data from cache
id: retrieve-data-cache
uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor
@@ -89,7 +89,7 @@ jobs:
- name: Store data in cache
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
continue-on-error: true
uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor

View File

@@ -69,7 +69,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
dependencies:
@@ -200,7 +200,7 @@ jobs:
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
with:
path: |
~/.nvm
@@ -213,7 +213,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |

View File

@@ -56,7 +56,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
integer:

View File

@@ -57,7 +57,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
integer:

View File

@@ -78,7 +78,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
dependencies:

View File

@@ -45,7 +45,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
wasm:
@@ -92,7 +92,7 @@ jobs:
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
with:
path: |
~/.nvm
@@ -105,7 +105,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |

View File

@@ -34,7 +34,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
backward:
@@ -79,11 +79,19 @@ jobs:
exit 1
fi
- name: Post/refresh backward-compat report
- name: Find existing comment
if: steps.report.outputs.has_report == 'true'
uses: marocchino/sticky-pull-request-comment@0ea0beb66eb9baf113663a64ec522f60e49231c0
id: find-comment
uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4.0.0
with:
header: backward-compat-snapshot
hide_and_recreate: true
hide_classify: OUTDATED
path: report.md
issue-number: ${{ github.event.pull_request.number }}
body-includes: '**Backward-compat snapshot:'
- name: Comment on PR
if: steps.report.outputs.has_report == 'true'
uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0
with:
comment-id: ${{ steps.find-comment.outputs.comment-id }}
issue-number: ${{ github.event.pull_request.number }}
body-path: report.md
edit-mode: replace

View File

@@ -223,7 +223,7 @@ jobs:
results_type: ${{ inputs.additional_results_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -99,7 +99,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_ct_key_sizes
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -89,7 +89,7 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_integer_multi_bit_gpu_default
path: ${{ env.RESULTS_FILENAME }}
@@ -173,7 +173,7 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -270,7 +270,7 @@ jobs:
filenames: ${{ inputs.additional_file_to_parse }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -204,7 +204,7 @@ jobs:
uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10
- name: Cache cargo
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
with:
path: |
~/.cargo/registry
@@ -232,7 +232,7 @@ jobs:
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Use Node.js
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
with:
node-version: 20.x
@@ -271,7 +271,7 @@ jobs:
- name: Upload profile artifact
env:
REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ env.REPORT_NAME }}
path: fhevm/coprocessor/fhevm-engine/tfhe-worker/${{ env.REPORT_NAME }}
@@ -302,7 +302,7 @@ jobs:
working-directory: fhevm/
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${COMMIT_SHA}_${BENCHMARKS}_${{ needs.parse-inputs.outputs.profile }}
path: fhevm/$${{ env.RESULTS_FILENAME }}

View File

@@ -185,7 +185,7 @@ jobs:
BENCH_TYPE: ${{ matrix.bench_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_${{ matrix.bench_type }}_${{ matrix.command }}_benchmarks
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -280,7 +280,7 @@ jobs:
BENCH_TYPE: ${{ env.__TFHE_RS_BENCH_TYPE }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_regression_${{ env.RESULTS_FILE_SHA }} # RESULT_FILE_SHA is needed to avoid collision between matrix.command runs
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -99,7 +99,7 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_fft
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -99,7 +99,7 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_ntt
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -46,7 +46,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
wasm_bench:

View File

@@ -124,7 +124,7 @@ jobs:
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
with:
path: |
~/.nvm
@@ -137,7 +137,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
@@ -180,7 +180,7 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_wasm_${{ matrix.browser }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -138,7 +138,7 @@ jobs:
- name: Node cache restoration
if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
id: node-cache
uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
with:
path: |
~/.nvm
@@ -151,7 +151,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
with:
path: |

View File

@@ -40,7 +40,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
fft:

View File

@@ -42,7 +42,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
ntt:

View File

@@ -43,7 +43,7 @@ jobs:
echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"
- name: Check workflows security
uses: zizmorcore/zizmor-action@b1d7e1fb5de872772f31590499237e7cce841e8e # v0.5.3
uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2
with:
advanced-security: 'false' # Print results directly in logs
persona: pedantic

View File

@@ -44,7 +44,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
tfhe:

View File

@@ -46,7 +46,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
csprng:

View File

@@ -87,7 +87,7 @@ jobs:
- name: Upload tables
if: inputs.backend_comparison == false
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_subset_${{inputs.bench_subset}}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
# This will upload all the files generated
@@ -111,7 +111,7 @@ jobs:
- name: Upload comparison tables
if: inputs.backend_comparison == true
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
with:
name: ${{ github.sha }}_backends_comparison_tables
# This will upload all the files generated

View File

@@ -49,7 +49,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -47,7 +47,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -49,7 +49,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -45,7 +45,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -49,7 +49,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -49,7 +49,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -49,7 +49,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -49,7 +49,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -47,7 +47,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:

View File

@@ -41,7 +41,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
hpu:

View File

@@ -62,7 +62,7 @@ jobs:
PACKAGE: ${{ inputs.package-name }}
run: |
cargo package -p "${PACKAGE}"
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: crate-${{ inputs.package-name }}
path: target/package/*.crate

View File

@@ -128,7 +128,7 @@ jobs:
run: |
cargo package -p "${PACKAGE}"
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: crate-${{ inputs.package-name }}
path: target/package/*.crate
@@ -196,13 +196,6 @@ jobs:
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
persist-credentials: "false"
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Download artifact
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
@@ -217,12 +210,12 @@ jobs:
env:
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
PACKAGE: ${{ inputs.package-name }}
DRY_RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
DRY-RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
run: |
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since DRY_RUN is handled in the env section above.
# dry-run expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since dry-run is handled in the env section above.
# shellcheck disable=SC2086
cargo publish -p "${PACKAGE}" ${DRY_RUN}
cargo publish -p "${PACKAGE}" ${DRY-RUN}
- name: Generate hash
id: published_hash
@@ -262,7 +255,7 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (${{ inputs.package-name }} release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -89,7 +89,7 @@ jobs:
make build_web_js_api_parallel
- name: Authenticate on NPM
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
with:
node-version: '24'
registry-url: 'https://registry.npmjs.org'

View File

@@ -53,7 +53,7 @@ jobs:
- name: Restore Sagemath image from cache
id: docker-cache
uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
with:
path: /tmp/sagemath_image
key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
@@ -76,7 +76,7 @@ jobs:
- name: Store Sagemath image in cache
if: steps.docker-cache.outputs.cache-hit != 'true'
continue-on-error: true
uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
with:
path: /tmp/sagemath_image
key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}

View File

@@ -312,7 +312,7 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
| grep -v '/cmake-build-debug/' \
| grep -v '/build/' \
| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
venv/bin/python3 "scripts/check_scratch_cleanup.py"
.PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
@@ -360,7 +360,7 @@ check_fmt_toml: install_taplo
.PHONY: check_typos # Check for typos in codebase
check_typos: install_typos_checker
@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" ":!*.hpu" | typos --file-list - && echo "No typos found"
@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" | typos --file-list - && echo "No typos found"
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain
@@ -587,17 +587,6 @@ clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selec
echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
fi
.PHONY: check_backward_compat_locks_did_not_change # Check backward compat Cargo.lock files are up to date
check_backward_compat_locks_did_not_change: install_rs_check_toolchain
@for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
echo "checking Cargo.lock for $$crate"; \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-C $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate metadata --locked --format-version 1 > /dev/null || \
( echo "Cargo.lock for $$crate is out of date. Update it with:" && \
echo " cd $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate && cargo metadata --format-version 1 > /dev/null" && \
echo "then commit the updated Cargo.lock." && exit 1 ); \
done
.PHONY: clippy_test_vectors # Run clippy lints on the test vectors app
clippy_test_vectors: install_rs_check_toolchain
cd apps/test-vectors; RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -1794,13 +1783,6 @@ bench_boolean: install_rs_check_toolchain
--bench boolean \
--features=boolean,internal-keycache -p tfhe-benchmark
.PHONY: bench_common_mask # Run benchmarks for CM-PBS
bench_common_mask: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench cm-bench \
--features=experimental -p tfhe-benchmark
.PHONY: bench_ks # Run benchmarks for keyswitch
bench_ks: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -2283,7 +2265,6 @@ pcc_batch_5:
$(call run_recipe_with_details,clippy_tfhe_lints)
$(call run_recipe_with_details,check_compile_tests)
$(call run_recipe_with_details,clippy_backward_compat_data)
$(call run_recipe_with_details,check_backward_compat_locks_did_not_change)
.PHONY: pcc_batch_6 # duration: 6'32''
pcc_batch_6:

View File

@@ -1,14 +1,5 @@
use std::path::PathBuf;
/// Returns the value of the first `NAME=` entry found in `/etc/os-release`,
/// with any surrounding double quotes removed.
///
/// Yields `None` when the file cannot be read or when no line starts with
/// `NAME=`.
fn get_linux_distribution_name() -> Option<String> {
    let os_release = std::fs::read_to_string("/etc/os-release").ok()?;
    os_release
        .lines()
        // Only the first matching line is considered, as lines are scanned in order.
        .find_map(|entry| entry.strip_prefix("NAME="))
        .map(|name| name.trim_matches('"').to_string())
}
use std::process::Command;
fn main() {
if let Ok(val) = std::env::var("DOCS_RS") {
@@ -37,7 +28,9 @@ fn main() {
println!("cargo::rerun-if-changed=src");
if std::env::consts::OS == "linux" {
if get_linux_distribution_name().as_deref() != Some("Ubuntu") {
let output = Command::new("./get_os_name.sh").output().unwrap();
let distribution = String::from_utf8(output.stdout).unwrap();
if distribution != "Ubuntu\n" {
println!(
"cargo:warning=This Linux distribution is not officially supported. \
Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"

View File

@@ -721,7 +721,7 @@ void cuda_integer_grouped_oprf_custom_range_64_async(
uint32_t num_blocks_intermediate, const void *seeded_lwe_input,
const uint64_t *decomposed_scalar, const uint64_t *has_at_least_one_set,
uint32_t num_scalars, uint32_t shift, int8_t *mem, void *const *bsks,
void *const *compute_bsks, void *const *ksks);
void *const *ksks);
void cleanup_cuda_integer_grouped_oprf_custom_range_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

View File

@@ -390,7 +390,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_a[6], &wires_a[15], &input_bits[7]);
XOR(&wires_a[10], &wires_a[15], &wires_b[0]);
XOR(&wires_a[11], &wires_a[20], &wires_a[9]);
FLUSH(&wires_a[6], &wires_a[10], &wires_a[11]);
FLUSH(&wires_a[6], &wires_a[10]);
XOR(&wires_a[7], &input_bits[7], &wires_a[11]);
FLUSH(&wires_a[7]);
XOR(&wires_a[17], &wires_a[10], &wires_a[11]);
@@ -426,7 +426,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_b[22], &wires_b[18], &wires_a[19]);
XOR(&wires_b[23], &wires_b[19], &wires_a[21]);
XOR(&wires_b[24], &wires_b[20], &wires_a[18]);
FLUSH(&wires_b[21], &wires_b[22], &wires_b[23], &wires_b[24]);
FLUSH(&wires_b[21], &wires_b[23], &wires_b[24]);
XOR(&wires_b[25], &wires_b[21], &wires_b[22]);
FLUSH(&wires_b[25]);
@@ -468,7 +468,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_b[37], &wires_b[36], &wires_b[34]);
XOR(&wires_b[38], &wires_b[27], &wires_b[36]);
FLUSH(&wires_b[38], &wires_b[37]);
FLUSH(&wires_b[38]);
XOR(&wires_b[44], &wires_b[33], &wires_b[37]);
CudaRadixCiphertextFFI *and_outs_6[] = {&wires_b[39]};
@@ -479,7 +479,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_b[40], &wires_b[25], &wires_b[39]);
XOR(&wires_b[41], &wires_b[40], &wires_b[37]);
XOR(&wires_b[43], &wires_b[29], &wires_b[40]);
FLUSH(&wires_b[41], &wires_b[40], &wires_b[43], &wires_b[44]);
FLUSH(&wires_b[41]);
XOR(&wires_b[45], &wires_b[42], &wires_b[41]);
FLUSH(&wires_b[45]);
@@ -514,7 +514,6 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_b[57], &wires_b[50], &wires_b[53]);
XOR(&wires_b[58], &wires_c[4], &wires_b[46]);
XOR(&wires_b[59], &wires_c[3], &wires_b[54]);
FLUSH(&wires_b[57], &wires_b[58]);
XOR(&wires_b[60], &wires_b[46], &wires_b[57]);
XOR(&wires_b[61], &wires_c[14], &wires_b[57]);
XOR(&wires_b[62], &wires_b[52], &wires_b[58]);
@@ -590,7 +589,6 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
#undef FLUSH
#undef AND
#undef ADD_ONE_FLUSH
#undef ADD_ONE
}
/**

View File

@@ -489,7 +489,7 @@ template <typename Torus>
__host__ void host_modulus_switch_multi_bit(
cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
check_cuda_error(cudaSetDevice(gpu_index));
cudaSetDevice(gpu_index);
int multibit_size = size / grouping_factor;
int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);

View File

@@ -72,13 +72,13 @@ void cuda_integer_grouped_oprf_custom_range_64_async(
uint32_t num_blocks_intermediate, const void *seeded_lwe_input,
const uint64_t *decomposed_scalar, const uint64_t *has_at_least_one_set,
uint32_t num_scalars, uint32_t shift, int8_t *mem, void *const *bsks,
void *const *compute_bsks, void *const *ksks) {
void *const *ksks) {
host_integer_grouped_oprf_custom_range<uint64_t>(
CudaStreams(streams), radix_lwe_out, num_blocks_intermediate,
(const uint64_t *)seeded_lwe_input, decomposed_scalar,
has_at_least_one_set, num_scalars, shift,
(int_grouped_oprf_custom_range_memory<uint64_t> *)mem, bsks, compute_bsks,
(int_grouped_oprf_custom_range_memory<uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}

View File

@@ -114,7 +114,7 @@ void host_integer_grouped_oprf_custom_range(
const Torus *decomposed_scalar, const Torus *has_at_least_one_set,
uint32_t num_scalars, uint32_t shift,
int_grouped_oprf_custom_range_memory<Torus> *mem_ptr, void *const *bsks,
void *const *compute_bsks, Torus *const *ksks) {
Torus *const *ksks) {
CudaRadixCiphertextFFI *computation_buffer = mem_ptr->tmp_oprf_output;
set_zero_radix_ciphertext_slice_async<Torus>(
@@ -127,12 +127,12 @@ void host_integer_grouped_oprf_custom_range(
host_integer_scalar_mul_radix<Torus>(
streams, computation_buffer, decomposed_scalar, has_at_least_one_set,
mem_ptr->scalar_mul_buffer, compute_bsks, ksks,
mem_ptr->params.message_modulus, num_scalars);
mem_ptr->scalar_mul_buffer, bsks, ksks, mem_ptr->params.message_modulus,
num_scalars);
host_logical_scalar_shift_inplace<Torus>(
streams, computation_buffer, shift, mem_ptr->logical_scalar_shift_buffer,
compute_bsks, ksks, num_blocks_intermediate);
host_logical_scalar_shift_inplace<Torus>(streams, computation_buffer, shift,
mem_ptr->logical_scalar_shift_buffer,
bsks, ksks, num_blocks_intermediate);
uint32_t num_blocks_output = radix_lwe_out->num_radix_blocks;
uint32_t blocks_to_copy =

View File

@@ -308,7 +308,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
pbs_buffer);
cuda_synchronize_stream(static_cast<cudaStream_t>(stream), gpu_index);
}
// Noise tests variant of the 128-bit multi-bit PBS, restricted to

View File

@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Print the distribution name read from /etc/os-release: keep the lines that
# contain NAME as a standalone word, then strip the `NAME="` prefix and any
# remaining double quotes.
cat /etc/os-release | grep "\<NAME\>" | sed "s/NAME=\"//g" | sed "s/\"//g"

View File

@@ -1647,7 +1647,6 @@ unsafe extern "C" {
shift: u32,
mem: *mut i8,
bsks: *const *mut ffi::c_void,
compute_bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
);
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-hpu-backend"
version = "0.5.0"
version = "0.4.0"
edition = "2021"
license = "BSD-3-Clause-Clear"
description = "HPU implementation on FPGA of TFHE-rs primitives."
@@ -36,7 +36,7 @@ thiserror = "1.0.61"
bytemuck = { workspace = true }
anyhow = "1.0.82"
lazy_static = "1.4.0"
rand = "0.10.1"
rand = "0.8.5"
regex = "1.10.4"
bitflags = { version = "2.5.0", features = ["serde"] }
itertools = "0.11.0"

View File

@@ -24,7 +24,7 @@ use mem_alloc::{MemAlloc, MemChunk};
mod qdma;
use qdma::QdmaDriver;
use rand::RngExt;
use rand::Rng;
const DMA_XFER_ALIGN: usize = 4096_usize;
@@ -148,8 +148,8 @@ impl HpuHw {
tracing::debug!("Load stage1 through JTAG");
let pdi_stg1_tmp = format!(
"hpu_stg1_{}.pdi",
rand::rng()
.sample_iter(rand::distr::Alphanumeric)
rand::thread_rng()
.sample_iter(rand::distributions::Alphanumeric)
.take(5)
.map(char::from)
.collect::<String>()

View File

@@ -156,7 +156,7 @@ impl HpuVarWrapped {
{
let mut inner = var.inner.lock().unwrap();
for (slot, ct) in std::iter::zip(inner.bundle.iter_mut(), ct) {
for (slot, ct) in std::iter::zip(inner.bundle.iter_mut(), ct.into_iter()) {
#[cfg(feature = "io-dump")]
let params = ct.params().clone();
for (id, cut) in ct.into_container().iter().enumerate() {

View File

@@ -1,14 +1,5 @@
use std::path::PathBuf;
/// Returns the value of the first `NAME=` entry found in `/etc/os-release`,
/// with surrounding double quotes removed.
///
/// Yields `None` when the file cannot be read or contains no line starting
/// with `NAME=`.
fn get_linux_distribution_name() -> Option<String> {
    std::fs::read_to_string("/etc/os-release")
        .ok()?
        .lines()
        .find_map(|entry| entry.strip_prefix("NAME="))
        .map(|raw_name| raw_name.trim_matches('"').to_string())
}
use std::process::Command;
fn main() {
// Handle docs.rs builds (no CUDA available)
@@ -38,10 +29,16 @@ fn main() {
println!("cargo:rustc-link-arg=-Wl,--allow-multiple-definition");
println!("cargo:rustc-link-arg=-Wl,--no-as-needed");
// Check Linux distribution (reuse script from tfhe-cuda-backend)
let manifest_dir = std::env::var("CARGO_MANIFEST_DIR")
.expect("CARGO_MANIFEST_DIR must be set by cargo during build");
if get_linux_distribution_name().as_deref() != Some("Ubuntu") {
let script_path = PathBuf::from(&manifest_dir).join("../tfhe-cuda-backend/get_os_name.sh");
let output = Command::new(&script_path)
.output()
.expect("Failed to run get_os_name.sh — is tfhe-cuda-backend present?");
let distribution =
String::from_utf8(output.stdout).expect("get_os_name.sh output must be valid UTF-8");
if distribution != "Ubuntu\n" {
println!(
"cargo:warning=This Linux distribution is not officially supported. \
Only Ubuntu is supported by zk-cuda-backend at this time. Build may fail\n"

View File

@@ -71,9 +71,14 @@ set(CMAKE_CUDA_FLAGS_DEBUG "-g -O0 -G")
# Additional CUDA flags (aligned with tfhe-cuda-backend)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -Xcompiler -Wextra --use_fast_math --expt-relaxed-constexpr")
# =============================================================================
# Path to tfhe-cuda-backend for device utilities
# =============================================================================
set(TFHE_CUDA_BACKEND_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../tfhe-cuda-backend/cuda)
# Core source files (without device utilities). Device utilities come from tfhe-cuda-backend.
set(FP_CORE_SOURCES src/primitives/fp.cu src/primitives/fp2.cu src/curve.cu src/msm/pippenger/msm_pippenger.cu
src/msm/msm.cu)
set(FP_CORE_SOURCES src/primitives/fp.cu src/primitives/fp2.cu src/primitives/xyzz.cu src/curve.cu
src/msm/pippenger/msm_pippenger.cu src/msm/msm.cu)
# Headers (common.cuh is a header, not a compiled source)
set(FP_MSM_HEADERS src/msm/common.cuh)
@@ -107,7 +112,7 @@ endif()
target_link_libraries(zk_cuda_backend PUBLIC cudart)
# Include both local headers and tfhe-cuda-backend headers (for device.h)
target_include_directories(zk_cuda_backend PUBLIC include ../src/include)
target_include_directories(zk_cuda_backend PUBLIC include ../src/include ${TFHE_CUDA_BACKEND_DIR}/include)
# =============================================================================
# Tests and Benchmarks (optional, controlled by ZK_CUDA_BACKEND_BUILD_TESTS/BENCHMARKS)
@@ -130,3 +135,4 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS "C++ standard: ${CMAKE_CXX_STANDARD}")
message(STATUS "CUDA standard: ${CMAKE_CUDA_STANDARD}")
message(STATUS "tfhe-cuda-backend path: ${TFHE_CUDA_BACKEND_DIR}")

View File

@@ -1,35 +0,0 @@
#pragma once
#include <cstddef>
#include <cstdio>
#include "device.h"
// Variadic checked multiplication of size_t values.
// Multiplies `first` by every value of `rest`, left to right, using
// __builtin_mul_overflow.
//
// Returns true as soon as a partial product overflows size_t; on overflow the
// value written to *out is unspecified. Returns false and stores the full
// product in *out otherwise.
//
// A single-factor call is valid: checked_mul(&out, x) stores x and returns
// false.
template <typename... Args>
inline bool checked_mul(size_t *out, size_t first, Args... rest) {
  size_t result = first;
  // The trailing neutral factor keeps the array non-empty when `rest` is an
  // empty pack (a bare `{}` cannot be iterated by a range-for and zero-length
  // arrays are ill-formed). Multiplying by 1 can never overflow.
  const size_t factors[] = {static_cast<size_t>(rest)...,
                            static_cast<size_t>(1)};
  for (size_t value : factors) {
    if (__builtin_mul_overflow(result, value, &result))
      return true;
  }
  *out = result;
  return false;
}
// Variadic safe multiplication: computes the product and panics on overflow.
// Delegates the multiplication to checked_mul and feeds its overflow flag to
// PANIC_IF_FALSE, so the call does not return when the product overflows.
// Returns first * rest... otherwise.
template <typename... Args> inline size_t safe_mul(size_t first, Args... rest) {
// Written by checked_mul; only read after the overflow check below passed.
size_t result;
bool overflow = checked_mul(&result, first, rest...);
// PANIC_IF_FALSE stringifies its condition, so the text `!overflow` is part
// of the emitted diagnostic.
PANIC_IF_FALSE(!overflow, "multiplication overflow wraps size_t");
return result;
}
// Variadic safe multiplication with an appended sizeof(T) factor.
// Computes (args... * sizeof(T)) with overflow checking.
// T must be named explicitly at the call site (it cannot be deduced);
// overflow handling is the one of safe_mul, to which the call is forwarded.
template <typename T, typename... Args>
inline size_t safe_mul_sizeof(Args... args) {
return safe_mul(args..., sizeof(T));
}

View File

@@ -17,7 +17,13 @@ __host__ __device__ void fp2_zero(Fp2 &a);
// G1 point: (x, y) coordinates in Fp
// Curve equation: y^2 = x^3 + b (short Weierstrass form with a = 0)
struct G1Affine {
//
// alignas(sizeof(uint64_t)): The bool infinity field causes the struct to be
// padded to the largest field alignment (4 bytes in 32-bit limb mode, 8 bytes
// in 64-bit). Forcing alignment to sizeof(uint64_t) ensures
// sizeof(G1Affine)==120 in both modes, matching the Rust FFI bindings which
// are always generated from the 64-bit layout regardless of LIMB_BITS_CONFIG.
struct alignas(sizeof(uint64_t)) G1Affine {
Fp x;
Fp y;
bool infinity; // true if point at infinity (identity element)
@@ -36,7 +42,9 @@ struct G1Affine {
// G2 point: (x, y) coordinates in Fp2
// Curve equation: y^2 = x^3 + b' (twisted curve over Fp2)
struct G2Affine {
//
// alignas(sizeof(uint64_t)): same ABI-stability reason as G1Affine above.
struct alignas(sizeof(uint64_t)) G2Affine {
Fp2 x;
Fp2 y;
bool infinity; // true if point at infinity (identity element)

View File

@@ -1,145 +0,0 @@
#ifndef DEVICE_H
#define DEVICE_H
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
extern "C" {
#define check_cuda_error(ans) \
{ cuda_error((ans), __FILE__, __LINE__); }
// Reports a failed CUDA call and terminates the process.
// Does nothing when `code` is cudaSuccess; otherwise prints the CUDA error
// string together with the originating file and line to stderr and aborts.
inline void cuda_error(cudaError_t code, const char *file, int line) {
  if (code == cudaSuccess)
    return;

  const char *description = cudaGetErrorString(code);
  std::fprintf(stderr, "Cuda error: %s %s %d\n", description, file, line);
  std::abort();
}
// The PANIC macro should be used to validate user-inputs to GPU functions
// it will execute in all targets, including production settings
// e.g., cudaMemCopy to the device should check that the destination pointer is
// a device pointer
//
// Prints "<file>::<line>::<function>: panic." followed by the printf-style
// message built from `format` and the variadic arguments to stderr, then
// aborts. `format` is concatenated with adjacent literals, so it must be a
// string literal.
// NOTE: the expansion is a bare `{ ... }` block, not `do { } while (0)`, so
// `if (c) PANIC("..."); else ...` does not compile.
#define PANIC(format, ...) \
{ \
std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \
__LINE__, __func__, ##__VA_ARGS__); \
std::abort(); \
}
// This is a generic assertion checking macro with user defined printf-style
// message
//
// Evaluates `cond` once; when it is false, calls PANIC with the user message
// followed by the stringified condition text (#cond). `format` must be a
// string literal because it is concatenated with the trailing literal.
#define PANIC_IF_FALSE(cond, format, ...) \
do { \
if (!(cond)) { \
PANIC(format "\n\n %s\n", ##__VA_ARGS__, #cond); \
} \
} while (0)
#ifndef GPU_ASSERTS_DISABLE
// The GPU assert should be used to validate assumptions in algorithms,
// for example, checking that two user-provided quantities have a certain
// relationship or that the size of the buffer provided to a function is
// sufficient when it is filled with some algorithm that depends on
// user-provided inputs e.g., OPRF corrections buffer should not have a size
// higher than the number of blocks in the datatype that is generated
//
// Usage: GPU_ASSERT(cond, "printf-style message", args...);
#define GPU_ASSERT(cond, format, ...)                                          \
  PANIC_IF_FALSE(cond, format, ##__VA_ARGS__)
#else
// Asserts disabled: accept and discard the same argument list as the enabled
// variant, so call sites that pass a message compile in both configurations.
// The arguments are not evaluated.
#define GPU_ASSERT(...)                                                        \
  do {                                                                         \
  } while (0)
#endif
uint32_t cuda_get_device();
void cuda_set_device(uint32_t gpu_index);
cudaEvent_t cuda_create_event(uint32_t gpu_index);
void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
uint32_t gpu_index);
void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
uint32_t gpu_index);
void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index);
cudaStream_t cuda_create_stream(uint32_t gpu_index);
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
uint32_t cuda_is_available();
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index,
uint64_t &size_tracker,
bool allocate_gpu_memory);
void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
uint64_t cuda_device_total_memory(uint32_t gpu_index);
void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
uint64_t size,
cudaStream_t stream,
uint32_t gpu_index,
bool gpu_memory_allocated);
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
void *dest, void const *src, uint64_t size, cudaStream_t stream,
uint32_t gpu_index, bool gpu_memory_allocated);
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
uint32_t gpu_index);
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
uint64_t size, cudaStream_t stream,
uint32_t gpu_index,
bool gpu_memory_allocated);
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
int cuda_get_number_of_gpus();
int cuda_get_number_of_sms();
void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
uint32_t gpu_index,
bool gpu_memory_allocated);
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
}
uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
uint32_t cuda_get_max_shared_memory_per_block(uint32_t gpu_index);
bool cuda_check_support_cooperative_groups();
bool cuda_check_support_thread_block_clusters();
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);
#endif

View File

@@ -18,7 +18,7 @@
// Supported values: 32, 64.
// ============================================================================
#ifndef LIMB_BITS_CONFIG
#define LIMB_BITS_CONFIG 64
#define LIMB_BITS_CONFIG 32
#endif
#if LIMB_BITS_CONFIG == 64
@@ -209,6 +209,17 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b);
// MONTGOMERY: Both inputs and output must be in Montgomery form
__host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b);
// Lazy addition: c = a + b, output in [0, 2p) for inputs in [0, p).
// Skips the final conditional subtraction of fp_add.
// Safe as input to fp_mont_mul (CIOS accepts [0, 2p)); NOT safe for final
// results or as input to fp_sub/fp_neg which require [0, p) inputs.
__host__ __device__ void fp_add_lazy(Fp &c, const Fp &a, const Fp &b);
// Lazy subtraction: c ≡ a - b (mod p), output in [0, 2p) for inputs in [0, p).
// Adds p unconditionally, skipping the borrow-select of fp_sub.
// Same safety concerns as fp_add_lazy.
__host__ __device__ void fp_sub_lazy(Fp &c, const Fp &a, const Fp &b);
// Multiplication: c = a * b (without reduction)
// "Raw" means the operation is performed without modular reduction modulo p.
// The result is stored in double-width (2*FP_LIMBS limbs) and may be >= p.
@@ -225,6 +236,11 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a);
// Both a and b are in Montgomery form, result is in Montgomery form
__host__ __device__ void fp_mont_mul(Fp &c, const Fp &a, const Fp &b);
// Montgomery squaring: c = (a^2 * R_INV) mod p
// Both input and output in Montgomery form.
// On device uses a triangular MAD chain (fewer multiplications).
__host__ __device__ void fp_mont_sqr(Fp &c, const Fp &a);
// CONVERSION: Input is normal form, output is Montgomery form
__host__ __device__ void fp_to_montgomery(Fp &c, const Fp &a);

View File

@@ -72,6 +72,11 @@ __host__ __device__ void fp2_add(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Subtraction: c = a - b
__host__ __device__ void fp2_sub(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Lazy add/sub: each component output in [0, 2p) for inputs in [0, p).
// Safe as input to fp2_mont_mul; same contract as fp_add_lazy / fp_sub_lazy.
__host__ __device__ void fp2_add_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b);
__host__ __device__ void fp2_sub_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Multiplication: c = a * b
// (a0 + a1*i) * (b0 + b1*i) = (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i
// NOTE: Assumes inputs are in normal form and converts to/from Montgomery
@@ -84,7 +89,7 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b);
// Montgomery squaring: c = a^2 (all in Montgomery form)
// Uses the complex-squaring identity: c0 = (a0+a1)(a0-a1), c1 = 2*a0*a1
// Only 2 Fp multiplications vs 3 for fp2_mont_mul(c, a, a).
// NOTE: All inputs and outputs are in Montgomery form (no conversions)
// NOTE: All inputs should be in Montgomery form
__host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a);
// Squaring: c = a^2

View File

@@ -1,16 +0,0 @@
#ifndef HELPER_PROFILE
#define HELPER_PROFILE
#ifdef USE_NVTOOLS
#include <nvtx3/nvToolsExt.h>
#endif
void cuda_nvtx_label_with_color(const char *name);
void cuda_nvtx_pop();
#define PUSH_RANGE(name) \
{ cuda_nvtx_label_with_color(name); }
#define POP_RANGE() \
{ cuda_nvtx_pop(); }
#endif

View File

@@ -3,6 +3,7 @@
#include "curve.h"
#include "fp.h"
#include "fp2.h"
#include "xyzz.h"
// ============================================================================
// Unified Trait System for Elliptic Curve Points
@@ -276,3 +277,65 @@ template <> struct SelectorChooser<G1Projective> {
template <> struct SelectorChooser<G2Projective> {
using Selection = Projective<G2Projective>;
};
// XYZZ<T>: trait for XYZZ extended Jacobian operations (used in MSM)
template <typename XYZZType> struct XYZZ;
// XYZZ trait specialization for G1: names the field (Fp), affine (G1Affine)
// and projective (G1Projective) types that go with G1XYZZ, and forwards every
// operation to the matching xyzz_* overload.
template <> struct XYZZ<G1XYZZ> {
using FieldType = Fp;
using AffineType = G1Affine;
using ProjectiveType = G1Projective;
// Sets p to the point at infinity (forwards to xyzz_infinity).
__host__ __device__ static void point_at_infinity(G1XYZZ &p) {
xyzz_infinity(p);
}
// Returns whether p is the point at infinity (forwards to xyzz_is_infinity).
__host__ __device__ static bool is_infinity(const G1XYZZ &p) {
return xyzz_is_infinity(p);
}
// Initializes xyzz from an affine point (forwards to xyzz_from_affine).
__host__ __device__ static void from_affine(G1XYZZ &xyzz,
const G1Affine &affine) {
xyzz_from_affine(xyzz, affine);
}
// In-place accumulation of the affine point p into acc (forwards to
// xyzz_mixed_add).
__host__ __device__ static void mixed_add(G1XYZZ &acc, const G1Affine &p) {
xyzz_mixed_add(acc, p);
}
// Converts the XYZZ accumulator to projective coordinates (forwards to
// xyzz_to_projective).
__host__ __device__ static void to_projective(G1Projective &proj,
const G1XYZZ &xyzz) {
xyzz_to_projective(proj, xyzz);
}
};
// XYZZ trait specialization for G2: names the field (Fp2), affine (G2Affine)
// and projective (G2Projective) types that go with G2XYZZ, and forwards every
// operation to the matching xyzz_* overload.
template <> struct XYZZ<G2XYZZ> {
using FieldType = Fp2;
using AffineType = G2Affine;
using ProjectiveType = G2Projective;
// Sets p to the point at infinity (forwards to xyzz_infinity).
__host__ __device__ static void point_at_infinity(G2XYZZ &p) {
xyzz_infinity(p);
}
// Returns whether p is the point at infinity (forwards to xyzz_is_infinity).
__host__ __device__ static bool is_infinity(const G2XYZZ &p) {
return xyzz_is_infinity(p);
}
// Initializes xyzz from an affine point (forwards to xyzz_from_affine).
__host__ __device__ static void from_affine(G2XYZZ &xyzz,
const G2Affine &affine) {
xyzz_from_affine(xyzz, affine);
}
// In-place accumulation of the affine point p into acc (forwards to
// xyzz_mixed_add).
__host__ __device__ static void mixed_add(G2XYZZ &acc, const G2Affine &p) {
xyzz_mixed_add(acc, p);
}
// Converts the XYZZ accumulator to projective coordinates (forwards to
// xyzz_to_projective).
__host__ __device__ static void to_projective(G2Projective &proj,
const G2XYZZ &xyzz) {
xyzz_to_projective(proj, xyzz);
}
};
// XYZZFor<ProjectiveType>: maps a projective type to its XYZZ accumulator type
// Only the two specializations below exist; the primary template is declared
// but not defined, so any other type argument fails to compile when ::Type is
// requested.
template <typename ProjectiveType> struct XYZZFor;
// G1Projective accumulates into G1XYZZ.
template <> struct XYZZFor<G1Projective> {
using Type = G1XYZZ;
};
// G2Projective accumulates into G2XYZZ.
template <> struct XYZZFor<G2Projective> {
using Type = G2XYZZ;
};

View File

@@ -0,0 +1,58 @@
#pragma once
#include "curve.h"
#include "fp.h"
#include "fp2.h"
// XYZZ Extended Jacobian Coordinates for BLS12-446
// G1 XYZZ point: (X, Y, ZZ, ZZZ) in Fp
// All four coordinates are Fp elements; a default-constructed value has every
// coordinate zeroed with fp_zero.
struct G1XYZZ {
Fp X;
Fp Y;
Fp ZZ;
Fp ZZZ;
// Default constructor: initializes to point at infinity (ZZ=ZZZ=0)
// X and Y are zeroed as well.
__host__ __device__ G1XYZZ() {
fp_zero(X);
fp_zero(Y);
fp_zero(ZZ);
fp_zero(ZZZ);
}
};
// G2 XYZZ point: (X, Y, ZZ, ZZZ) in Fp2
// All four coordinates are Fp2 elements; a default-constructed value has every
// coordinate zeroed with fp2_zero.
struct G2XYZZ {
Fp2 X;
Fp2 Y;
Fp2 ZZ;
Fp2 ZZZ;
// Default constructor: initializes to point at infinity (ZZ=ZZZ=0)
// X and Y are zeroed as well.
__host__ __device__ G2XYZZ() {
fp2_zero(X);
fp2_zero(Y);
fp2_zero(ZZ);
fp2_zero(ZZZ);
}
};
// Initialize XYZZ from an affine point: X=x, Y=y, ZZ=ZZZ=1 (Montgomery form)
__host__ __device__ void xyzz_from_affine(G1XYZZ &xyzz, const G1Affine &affine);
__host__ __device__ void xyzz_from_affine(G2XYZZ &xyzz, const G2Affine &affine);
// Set XYZZ to the point at infinity: ZZ=ZZZ=0 (X,Y left undefined)
__host__ __device__ void xyzz_infinity(G1XYZZ &p);
__host__ __device__ void xyzz_infinity(G2XYZZ &p);
__host__ __device__ bool xyzz_is_infinity(const G1XYZZ &p);
__host__ __device__ bool xyzz_is_infinity(const G2XYZZ &p);
__host__ __device__ void xyzz_mixed_add(G1XYZZ &acc, const G1Affine &p);
__host__ __device__ void xyzz_mixed_add(G2XYZZ &acc, const G2Affine &p);
__host__ __device__ void xyzz_to_projective(G1Projective &proj,
const G1XYZZ &xyzz);
__host__ __device__ void xyzz_to_projective(G2Projective &proj,
const G2XYZZ &xyzz);

View File

@@ -1413,7 +1413,7 @@ __host__ __device__ void projective_point_add(G1Projective &result,
u = Y2Z1 - Y1Z2;
// uu = u^2
fp_mont_mul(uu, u, u);
fp_mont_sqr(uu, u);
// v = X2 * Z1 - X1 * Z2 = X2*Z1 - X1Z2
Fp X2Z1;
@@ -1428,7 +1428,7 @@ __host__ __device__ void projective_point_add(G1Projective &result,
}
// vv = v^2
fp_mont_mul(vv, v, v);
fp_mont_sqr(vv, v);
// vvv = v * vv
fp_mont_mul(vvv, v, vv);
@@ -1568,9 +1568,9 @@ __host__ __device__ void projective_mixed_add(G1Projective &result,
}
// uu = u^2
fp_mont_mul(uu, u, u);
fp_mont_sqr(uu, u);
// vv = v^2
fp_mont_mul(vv, v, v);
fp_mont_sqr(vv, v);
// vvv = v * vv
fp_mont_mul(vvv, v, vv);
@@ -1692,7 +1692,7 @@ __host__ __device__ void projective_point_double(G1Projective &result,
// A = 3 * X^2
Fp X_sq, A;
fp_mont_mul(X_sq, p.X, p.X);
fp_mont_sqr(X_sq, p.X);
fp_mul3(A, X_sq);
// B = Y * Z
@@ -1706,7 +1706,7 @@ __host__ __device__ void projective_point_double(G1Projective &result,
// D = A^2 - 8*C
Fp A_sq, eight_C;
fp_mont_mul(A_sq, A, A);
fp_mont_sqr(A_sq, A);
fp_mul8(eight_C, C);
Fp D = A_sq - eight_C;
@@ -1716,14 +1716,16 @@ __host__ __device__ void projective_point_double(G1Projective &result,
fp_double(result.X, BD);
// Y3 = A * (4*C - D) - 8 * Y^2 * B^2
Fp four_C, A_times_diff;
Fp four_C, four_C_minus_D, A_times_diff;
fp_mul4(four_C, C);
Fp four_C_minus_D = four_C - D;
// Lazy sub: four_C_minus_D feeds fp_mont_mul, so skip the conditional
// subtract and output in [0, 2p) instead of [0, p).
fp_sub_lazy(four_C_minus_D, four_C, D);
fp_mont_mul(A_times_diff, A, four_C_minus_D);
Fp Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;
fp_mont_mul(Y_sq, p.Y, p.Y);
fp_mont_mul(B_sq, B, B);
fp_mont_sqr(Y_sq, p.Y);
fp_mont_sqr(B_sq, B);
fp_mont_mul(Y_sq_B_sq, Y_sq, B_sq);
fp_mul8(eight_Y_sq_B_sq, Y_sq_B_sq);
result.Y = A_times_diff - eight_Y_sq_B_sq;
@@ -1773,9 +1775,13 @@ __host__ __device__ void projective_point_double(G2Projective &result,
fp2_double(result.X, BD);
// Y3 = A * (4*C - D) - 8 * Y^2 * B^2
Fp2 four_C, A_times_diff;
Fp2 four_C, four_C_minus_D, A_times_diff;
fp2_mul4(four_C, C);
Fp2 four_C_minus_D = four_C - D;
// We cannot use the lazy subtraction here: for Fp2 with the Karatsuba path we
// would end up with values in [0, 4p) instead of [0, 2p), which would break
// the final result.
fp2_sub(four_C_minus_D, four_C, D);
fp2_mont_mul(A_times_diff, A, four_C_minus_D);
Fp2 Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;

View File

@@ -1,43 +0,0 @@
#include "helper_profile.cuh"
#include <stdint.h>
// Adler-32-style checksum of a NUL-terminated byte string.
// Every byte is doubled before being added to the low accumulator, so the
// result differs from the standard Adler-32 value. Returns the high sum in
// the upper 16 bits and the low sum in the lower 16 bits; an empty string
// yields 1.
uint32_t adler32(const unsigned char *data) {
  const uint32_t MOD_ADLER = 65521;
  uint32_t low_sum = 1;
  uint32_t high_sum = 0;
  // Walk the string up to (not including) its terminating zero byte.
  for (const unsigned char *cursor = data; *cursor != 0; ++cursor) {
    low_sum = (low_sum + *cursor * 2) % MOD_ADLER;
    high_sum = (high_sum + low_sum) % MOD_ADLER;
  }
  return (high_sum << 16) | low_sum;
}
// Pushes an NVTX range labelled `name`, coloured from a hash of the label so
// that the same label always gets the same colour.
// `name` must be a NUL-terminated string. The whole body is compiled out when
// USE_NVTOOLS is not defined.
void cuda_nvtx_label_with_color(const char *name) {
#ifdef USE_NVTOOLS
  // Keep the hash unsigned: the mask-and-shift extraction below is then free
  // of any signed conversion.
  const uint32_t hash = adler32((const unsigned char *)name);
  // Channels are taken from bits 0-7, 12-19 and 20-27 of the hash.
  uint32_t r = hash & 0x000000ff;
  uint32_t g = (hash & 0x000ff000) >> 12;
  uint32_t b = (hash & 0x0ff00000) >> 20;
  // Logical && (not bitwise &) combines the three comparisons; when every
  // channel is below 64 the channels are scaled up.
  if (r < 64 && g < 64 && b < 64) {
    r = r * 3;
    g = g * 3 + 64;
    b = b * 4;
  }
  // Opaque ARGB value: alpha 0xff followed by the three channels.
  const uint32_t argb = 0xff000000 | (r << 16) | (g << 8) | b;
  nvtxEventAttributes_t eventAttrib = {0};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = argb;
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.message.ascii = name;
  nvtxRangePushEx(&eventAttrib);
#endif
}
// Calls nvtxRangePop() to end an NVTX range. The body is compiled out (the
// function becomes a no-op) when USE_NVTOOLS is not defined.
void cuda_nvtx_pop() {
#ifdef USE_NVTOOLS
nvtxRangePop();
#endif
}

View File

@@ -22,20 +22,20 @@ template <typename AffineType> struct Phase1KernelLaunchParams {
Phase1KernelLaunchParams(uint32_t n, uint32_t requested_threads_per_block,
uint32_t bucket_count, uint32_t gpu_index) {
// Shared memory layout:
// - bucket_counts: [bucket_count] * sizeof(uint32_t)
// - bucket_counts: [bucket_count] * sizeof(uint32_t)
// - bucket_offsets: [bucket_count] * sizeof(uint32_t)
// - sorted_points: [threads_per_block] * sizeof(AffineType)
// - sorted_buckets: [threads_per_block] * sizeof(uint32_t)
constexpr size_t per_thread_shared_mem =
sizeof(AffineType) + sizeof(uint32_t); // sorted_points + sorted_buckets
const size_t fixed_shared_mem =
2 * bucket_count * sizeof(uint32_t); // bucket_counts + bucket_offsets
// - sorted_points: [threads_per_block] * sizeof(AffineType)
//
// sorted_points starts right after 2*bucket_count uint32_t slots, where
// bucket_count = 2^(c-1)+1. 2*bucket_count is even for any bucket_count,
// so the byte offset 2*bucket_count*4 is always a multiple of 8.
// No alignment padding is needed.
const size_t fixed_shared_mem = 2 * bucket_count * sizeof(uint32_t);
constexpr size_t per_thread_shared_mem = sizeof(AffineType);
// Query the actual per-block shared memory limit from the device
const uint32_t max_shared_mem_per_block =
cuda_get_max_shared_memory_per_block(gpu_index);
// Calculate maximum threads that fit within shared memory limit
const size_t available_shared_mem =
(max_shared_mem_per_block > fixed_shared_mem)
? (max_shared_mem_per_block - fixed_shared_mem)
@@ -43,7 +43,6 @@ template <typename AffineType> struct Phase1KernelLaunchParams {
const uint32_t max_threads_for_shared_mem =
available_shared_mem / per_thread_shared_mem;
// Cap threads_per_block to respect shared memory limit
adjusted_threads_per_block =
std::min(requested_threads_per_block, max_threads_for_shared_mem);
@@ -52,10 +51,7 @@ template <typename AffineType> struct Phase1KernelLaunchParams {
"kernel launch (max_shared=%u, fixed=%zu)",
max_shared_mem_per_block, fixed_shared_mem);
// Calculate number of blocks per window
num_blocks_per_window = CEIL_DIV(n, adjusted_threads_per_block);
// Calculate actual shared memory requirement
accum_shared_mem =
fixed_shared_mem + adjusted_threads_per_block * per_thread_shared_mem;
}
@@ -67,15 +63,12 @@ template <typename ProjectiveType> struct Phase2KernelLaunchParams {
size_t shared_mem;
Phase2KernelLaunchParams(uint32_t requested_threads, uint32_t gpu_index) {
// Query the actual per-block shared memory limit from the device
const uint32_t max_shared_mem_per_block =
cuda_get_max_shared_memory_per_block(gpu_index);
// Calculate maximum threads that fit within shared memory limit
const uint32_t max_threads_for_shared =
max_shared_mem_per_block / sizeof(ProjectiveType);
// Cap threads to respect shared memory limit
uint32_t threads = std::min(requested_threads, max_threads_for_shared);
threads = std::min(threads, static_cast<uint32_t>(KERNEL_THREADS_MAX));
@@ -84,15 +77,11 @@ template <typename ProjectiveType> struct Phase2KernelLaunchParams {
while (pow2_threads < threads)
pow2_threads *= 2;
// After rounding to power of 2, verify shared memory doesn't exceed device
// limit
if (safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(pow2_threads)) >
max_shared_mem_per_block) {
pow2_threads /= 2;
}
adjusted_threads = pow2_threads;
// Calculate actual shared memory requirement
shared_mem =
safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(adjusted_threads));
}
@@ -153,19 +142,96 @@ __device__ __forceinline__ uint32_t extract_window_bigint(
window_size);
}
// Kernel: Accumulate ALL windows in parallel using SORT-THEN-REDUCE
// Grid: (num_windows * num_blocks_per_window) blocks
// Each block processes points for ONE window
// Uses counting sort by bucket, then parallel tree reduction per bucket
// Uses mixed addition (affine + projective) to save 3 field muls per add
// ============================================================================
// Preprocessing kernel: scalar → signed-digit representation
// ============================================================================
//
// Converts each scalar into balanced signed-digit form before the main MSM,
// eliminating any need for a correction term.
//
// For each window w (LSB-first, w=0 = least significant), with carry from the
// previous window:
//
// effective = raw_digit + carry
// if effective > half: digit = effective - 2^c (negative or zero), carry = 1
// else:                digit = effective (zero or positive), carry = 0
//
// where half = 2^(c-1), c = window_size.
//
// Result: digit ∈ {-(half-1), …, half}, so |digit| ≤ half = bucket_count - 1.
// Positive digit → add point P to bucket[digit].
// Negative digit → add -P (Y-negated) to bucket[|digit|].
// Zero digit → skip.
//
// Output layout: d_signed_digits[window_idx * n + point_idx], where window_idx
// is in Horner (MSB-first) order so the main kernel indexes it directly.
// This layout is column-major in window_idx: threads in a warp (consecutive
// point_idx) access the same window row → coalesced reads in the main kernel.
//
// num_windows is set to (scalar_bits + window_size) / window_size so there is
// always at least one partial or empty window at the top to absorb any carry
// propagated out of the last full window.
// Rewrites every scalar as a sequence of balanced signed window digits.
//
// One thread handles one scalar. Windows are visited from the least
// significant one upwards so that the carry produced by a window is consumed
// by the next one; each digit is stored at
//   d_signed_digits[(num_windows - 1 - w) * n + point_idx]
// i.e. with the window index reversed with respect to the extraction order.
//
// d_signed_digits: output, num_windows * n entries
// d_scalars:       input, n scalars
// n:               number of scalars
// num_windows:     number of windows written per scalar
// window_size:     window width in bits
//
// NOTE(review): digits are stored as int8_t and can reach
// +2^(window_size - 1), so the cast only preserves the value for
// window_size <= 7 -- confirm callers enforce that bound. A carry left over
// after the last window is not stored anywhere.
__global__ void kernel_preprocess_signed_digits(
    int8_t *__restrict__ d_signed_digits, // [num_windows * n], Horner-ordered
    const Scalar *__restrict__ d_scalars, uint32_t n, uint32_t num_windows,
    uint32_t window_size) {
  const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) {
    const uint32_t midpoint = 1u << (window_size - 1);
    const uint32_t radix = 1u << window_size;
    const Scalar &k = d_scalars[tid];

    uint32_t carry_in = 0;
    for (uint32_t lsb_window = 0; lsb_window < num_windows; lsb_window++) {
      const uint32_t value =
          extract_window_bigint(k, lsb_window, window_size) + carry_in;
      // Above the midpoint the digit wraps to value - radix and a carry is
      // handed to the next window.
      const bool wraps = value > midpoint;
      carry_in = wraps ? 1u : 0u;
      const int8_t digit = wraps ? -(int8_t)(radix - value) : (int8_t)value;

      // Store with the window index reversed.
      const uint32_t out_window = num_windows - 1 - lsb_window;
      d_signed_digits[out_window * (size_t)n + tid] = digit;
    }
  }
}
// ============================================================================
// Phase 1: Accumulate all windows in parallel using sort-then-reduce
// ============================================================================
//
// Grid: (num_windows * num_blocks_per_window) blocks.
// Each block processes a slice of points for ONE window.
//
// The signed digit for each point has already been computed by
// kernel_preprocess_signed_digits. A negative digit means -P must be added to
// bucket[|digit|]; the Y negation is done inline here, right before
// scattering, which is cheaper than reading a separate flag array.
//
// Shared memory layout (no sorted_buckets — not needed without correction):
// [bucket_counts (bc)] [bucket_offsets (bc)] [sorted_points (blockDim.x)]
template <typename AffineType, typename ProjectiveType>
__global__ void kernel_accumulate_all_windows(
ProjectiveType *__restrict__ all_block_buckets, // [num_windows * num_blocks
// * bucket_count]
const AffineType *__restrict__ points, const Scalar *__restrict__ scalars,
const AffineType *__restrict__ points,
const int8_t *__restrict__ d_signed_digits, // [num_windows * n]
uint32_t num_points, uint32_t num_windows, uint32_t num_blocks_per_window,
uint32_t window_size, uint32_t bucket_count) {
uint32_t bucket_count) {
using ProjectivePoint = Projective<ProjectiveType>;
using XYZZType = typename XYZZFor<ProjectiveType>::Type;
using XYZZPoint = XYZZ<XYZZType>;
const uint32_t window_idx = blockIdx.x / num_blocks_per_window;
const uint32_t block_within_window = blockIdx.x % num_blocks_per_window;
@@ -173,115 +239,89 @@ __global__ void kernel_accumulate_all_windows(
if (window_idx >= num_windows)
return;
// Output offset for this block's buckets
uint32_t bucket_offset =
(window_idx * num_blocks_per_window + block_within_window) * bucket_count;
ProjectiveType *my_buckets = all_block_buckets + bucket_offset;
// Shared memory layout (register-based optimization):
// - bucket_counts: [bucket_count] for counting sort
// - bucket_offsets: [bucket_count] for prefix sums
// - sorted_points: [blockDim.x] for sorted points (AFFINE - smaller!)
// - sorted_buckets: [blockDim.x] for sorted bucket indices
// NOTE: shared_buckets removed - using register-based accumulation instead
// Shared memory: [bucket_counts][bucket_offsets][sorted_points]
extern __shared__ char shared_mem[];
auto *bucket_counts_arr = reinterpret_cast<uint32_t *>(shared_mem);
auto *bucket_offsets = bucket_counts_arr + bucket_count;
// Store affine points instead of projective - saves shared memory
auto *bucket_offsets_arr = bucket_counts_arr + bucket_count;
auto *sorted_points =
reinterpret_cast<AffineType *>(bucket_offsets + bucket_count);
auto *sorted_buckets =
reinterpret_cast<uint32_t *>(sorted_points + blockDim.x);
reinterpret_cast<AffineType *>(bucket_offsets_arr + bucket_count);
// Initialize bucket counts
if (threadIdx.x < bucket_count) {
bucket_counts_arr[threadIdx.x] = 0;
}
__syncthreads();
// Each thread loads its affine point and computes bucket index
// No conversion to projective here - we keep points affine
uint32_t point_idx = threadIdx.x + block_within_window * blockDim.x;
// Each thread reads its signed digit and loads its affine point.
// Negative digit: negate Y and use |digit| as bucket index.
// Zero digit: skip (my_bucket = 0).
const uint32_t point_idx = threadIdx.x + block_within_window * blockDim.x;
AffineType my_point;
uint32_t my_bucket = 0;
bool valid = point_idx < num_points;
const bool valid = point_idx < num_points;
if (valid) {
uint32_t scalar_window = num_windows - 1 - window_idx;
my_bucket =
extract_window_bigint(scalars[point_idx], scalar_window, window_size);
my_point = points[point_idx]; // Keep as affine!
const int8_t sd =
d_signed_digits[window_idx * (size_t)num_points + point_idx];
my_point = points[point_idx];
if (sd < 0) {
my_point.y = -my_point.y; // negate Y for negative digit
my_bucket = (uint32_t)(-sd);
} else {
my_bucket = (uint32_t)sd; // 0 means skip
}
}
// Count points per bucket (atomic within block)
if (valid && my_bucket > 0) {
atomicAdd(&bucket_counts_arr[my_bucket], 1);
}
__syncthreads();
// Compute prefix sums for bucket offsets
// Thread 0 computes prefix sums (bucket start offsets).
if (threadIdx.x == 0) {
uint32_t offset = 0;
for (uint32_t b = 0; b < bucket_count; b++) {
bucket_offsets[b] = offset;
bucket_offsets_arr[b] = offset;
offset += bucket_counts_arr[b];
bucket_counts_arr[b] = 0; // Reset for scatter phase
bucket_counts_arr[b] = 0; // reset to zero for use as a scatter counter
}
}
__syncthreads();
// Scatter affine points to sorted positions
// Scatter: each thread writes its (possibly Y-negated) point into the sorted
// position for its bucket.
if (valid && my_bucket > 0) {
uint32_t pos =
bucket_offsets[my_bucket] + atomicAdd(&bucket_counts_arr[my_bucket], 1);
sorted_points[pos] = my_point; // Store affine point directly
sorted_buckets[pos] = my_bucket;
uint32_t pos = bucket_offsets_arr[my_bucket] +
atomicAdd(&bucket_counts_arr[my_bucket], 1);
sorted_points[pos] = my_point;
}
__syncthreads();
// Parallel tree reduction within each bucket using MIXED ADDITION
// Each thread is assigned to reduce points in one bucket
// REGISTER-BASED: Accumulate in registers, write directly to global memory
// Bucket reduction: each thread owns one or more buckets (stride by
// blockDim.x). Points for bucket b occupy sorted_points[start..start+count].
for (uint32_t bucket = threadIdx.x + 1; bucket < bucket_count;
bucket += blockDim.x) {
uint32_t start = bucket_offsets[bucket];
uint32_t count = bucket_counts_arr[bucket];
const uint32_t start = bucket_offsets_arr[bucket];
const uint32_t count = bucket_counts_arr[bucket];
if (count == 0) {
// Empty bucket - write infinity point
ProjectivePoint::point_at_infinity(my_buckets[bucket]);
continue;
}
// Tree reduction for this bucket using mixed addition
// Accumulate in registers (compiler will optimize this)
ProjectiveType sum;
// Initialize sum from first affine point
const AffineType &first_point = sorted_points[start];
if (first_point.infinity) {
ProjectivePoint::point_at_infinity(sum);
} else {
ProjectivePoint::affine_to_projective(sum, first_point);
XYZZType sum;
XYZZPoint::point_at_infinity(sum);
for (uint32_t i = 0; i < count; i++) {
XYZZPoint::mixed_add(sum, sorted_points[start + i]);
}
// Use mixed addition for remaining points (saves 3 muls per add!)
for (uint32_t i = 1; i < count; i++) {
const AffineType &pt = sorted_points[start + i];
if (!pt.infinity) {
if (ProjectivePoint::is_infinity(sum)) {
ProjectivePoint::affine_to_projective(sum, pt);
} else {
ProjectiveType temp;
// MIXED ADDITION: projective + affine (saves 3 field muls)
ProjectivePoint::mixed_add(temp, sum, pt);
ProjectivePoint::point_copy(sum, temp);
}
}
}
// Write directly from registers to global memory (no shared memory
// intermediate)
ProjectivePoint::point_copy(my_buckets[bucket], sum);
ProjectiveType proj;
XYZZPoint::to_projective(proj, sum);
ProjectivePoint::point_copy(my_buckets[bucket], proj);
}
}
@@ -349,7 +389,6 @@ __global__ void kernel_reduce_all_windows(
__syncthreads();
}
// Thread 0 writes final bucket value
if (threadIdx.x == 0) {
uint32_t out_idx = window_idx * num_buckets + bucket_idx;
ProjectivePoint::point_copy(all_final_buckets[out_idx], shared_sums[0]);
@@ -358,7 +397,7 @@ __global__ void kernel_reduce_all_windows(
// Kernel: Compute window sums for ALL windows in parallel
// Grid: num_windows blocks
// Each block computes the window sum: sum(i * bucket[i]) for i=1..15
// Each block computes the window sum: sum(i * bucket[i]) for i=1..n
template <typename ProjectiveType>
__global__ void kernel_compute_window_sums(
ProjectiveType *__restrict__ window_sums, // [num_windows]
@@ -427,7 +466,6 @@ __global__ void kernel_compute_window_sums(
break;
}
// Thread 0 writes window sum
if (tid == 0) {
ProjectivePoint::point_copy(window_sums[window_idx], work[0]);
}
@@ -454,7 +492,6 @@ void horner_combine_cpu(ProjectiveType &result,
ProjectiveType acc;
ProjectivePoint::point_at_infinity(acc);
// Process from MSB (window 0) to LSB (window num_windows-1)
for (uint32_t w = 0; w < num_windows; w++) {
const ProjectiveType &ws = window_sums[w];
ProjectiveType temp;
@@ -463,7 +500,6 @@ void horner_combine_cpu(ProjectiveType &result,
if (ProjectivePoint::is_infinity(acc)) {
ProjectivePoint::point_copy(acc, ws);
} else {
// acc = acc * 2^window_size + ws
for (uint32_t i = 0; i < window_size; i++) {
ProjectivePoint::projective_double(temp, acc);
ProjectivePoint::point_copy(acc, temp);
@@ -472,7 +508,6 @@ void horner_combine_cpu(ProjectiveType &result,
ProjectivePoint::point_copy(acc, temp);
}
} else if (!ProjectivePoint::is_infinity(acc)) {
// Window sum is infinity but accumulator is not -- still shift left
for (uint32_t i = 0; i < window_size; i++) {
ProjectivePoint::projective_double(temp, acc);
ProjectivePoint::point_copy(acc, temp);
@@ -492,6 +527,14 @@ void horner_combine_cpu(ProjectiveType &result,
// d_scratch: caller-provided device buffer for intermediate bucket arrays and
// window sums. The caller is responsible for allocating and freeing this
// buffer.
//
// Scratch layout (all ProjectiveType elements):
// d_all_block_buckets [num_windows * num_blocks * bucket_count]
// d_all_final_buckets [num_windows * bucket_count]
// d_window_sums [num_windows]
//
// d_signed_digits is allocated internally (stream-ordered) and freed before
// the host memcpy, so it does not appear in the caller's scratch buffer.
template <typename AffineType, typename ProjectiveType>
void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
ProjectiveType *h_result,
@@ -513,35 +556,29 @@ void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
cuda_set_device(gpu_index);
// Calculate number of windows based on scalar bit width
// Compute number of windows. We use (scalar_bits + window_size) / window_size
// instead of the usual ceil formula so that there is always at least one
// partial window at the top. This guarantees the preprocessing kernel's carry
// propagation never overflows the digit array, regardless of window size.
const uint32_t total_bits = Scalar::NUM_BITS;
const uint32_t num_windows = CEIL_DIV(total_bits, window_size);
const uint32_t num_windows = (total_bits + window_size) / window_size;
// Calculate kernel launch parameters respecting shared memory limits
Phase1KernelLaunchParams<AffineType> launch_params(n, threads_per_block,
bucket_count, gpu_index);
// Scratch space layout for ALL-WINDOWS-PARALLEL:
// - all_block_buckets: [num_windows * num_blocks * bucket_count]
// - all_final_buckets: [num_windows * bucket_count]
// - window_sums: [num_windows]
// Compute element counts in size_t (64-bit) so that intermediate products
// of uint32_t inputs don't silently wrap at 2^32 before reaching the
// explicit overflow check below (which multiplies by sizeof(ProjectiveType))
const size_t all_block_buckets_size = static_cast<size_t>(num_windows) *
launch_params.num_blocks_per_window *
bucket_count;
// Scratch layout
const size_t num_blocks = launch_params.num_blocks_per_window;
const size_t all_block_buckets_size =
static_cast<size_t>(num_windows) * num_blocks * bucket_count;
const size_t all_final_buckets_size =
static_cast<size_t>(num_windows) * bucket_count;
const size_t total_scratch =
all_block_buckets_size + all_final_buckets_size + num_windows;
// Partition the caller-provided scratch buffer into sub-regions
ProjectiveType *d_all_block_buckets = d_scratch;
ProjectiveType *d_all_final_buckets = d_scratch + all_block_buckets_size;
ProjectiveType *d_window_sums = d_all_final_buckets + all_final_buckets_size;
// Clear all scratch space
const uint32_t clear_blocks = CEIL_DIV(total_scratch, KERNEL_THREADS_MAX);
PANIC_IF_FALSE(clear_blocks * KERNEL_THREADS_MAX >= total_scratch,
"kernel_clear_buckets: insufficient threads (%zu) to clear "
@@ -553,11 +590,27 @@ void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
total_scratch);
check_cuda_error(cudaGetLastError());
// Phase 1: Accumulate ALL windows in parallel (SINGLE kernel launch!)
// Preprocessing: convert scalars to signed-digit form.
// Allocated stream-ordered; freed before the CPU phase once Phase 1 is done.
int8_t *d_signed_digits = nullptr;
const size_t signed_digits_bytes =
static_cast<size_t>(num_windows) * n * sizeof(int8_t);
cudaMallocAsync(&d_signed_digits, signed_digits_bytes, stream);
check_cuda_error(cudaGetLastError());
constexpr uint32_t preprocess_threads = 128;
const uint32_t preprocess_blocks = CEIL_DIV(n, preprocess_threads);
kernel_preprocess_signed_digits<<<preprocess_blocks, preprocess_threads, 0,
stream>>>(d_signed_digits, d_scalars, n,
num_windows, window_size);
check_cuda_error(cudaGetLastError());
// Phase 1: Accumulate ALL windows in parallel.
const uint32_t total_accum_blocks =
num_windows * launch_params.num_blocks_per_window;
PANIC_IF_FALSE(
total_accum_blocks * bucket_count <= all_block_buckets_size,
static_cast<size_t>(total_accum_blocks) * bucket_count <=
all_block_buckets_size,
"kernel_accumulate_all_windows: max write index (%zu) exceeds buffer "
"(%zu)",
static_cast<size_t>(total_accum_blocks) * bucket_count,
@@ -565,11 +618,15 @@ void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
kernel_accumulate_all_windows<AffineType, ProjectiveType>
<<<total_accum_blocks, launch_params.adjusted_threads_per_block,
launch_params.accum_shared_mem, stream>>>(
d_all_block_buckets, d_points, d_scalars, n, num_windows,
launch_params.num_blocks_per_window, window_size, bucket_count);
d_all_block_buckets, d_points, d_signed_digits, n, num_windows,
launch_params.num_blocks_per_window, bucket_count);
check_cuda_error(cudaGetLastError());
// Phase 2: Reduce ALL windows' buckets in parallel (SINGLE kernel launch!)
// d_signed_digits is no longer needed after Phase 1.
cudaFreeAsync(d_signed_digits, stream);
check_cuda_error(cudaGetLastError());
// Phase 2: Reduce ALL windows' buckets in parallel.
const uint32_t total_reduce_blocks = num_windows * bucket_count;
Phase2KernelLaunchParams<ProjectiveType> reduce_params(
launch_params.num_blocks_per_window, gpu_index);
@@ -584,9 +641,7 @@ void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
launch_params.num_blocks_per_window, bucket_count);
check_cuda_error(cudaGetLastError());
// Phase 3: Compute window sums in parallel (SINGLE kernel launch!)
// Round up to next multiple of 32 (warp size) for efficient scheduling.
// The kernel already has `if (tid < n)` bounds checks for the excess threads.
// Phase 3: Compute window sums in parallel.
const uint32_t combine_threads = ((bucket_count - 1) + 31) & ~31u;
const size_t combine_shared_mem =
safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(combine_threads));
@@ -600,11 +655,7 @@ void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
d_window_sums, d_all_final_buckets, num_windows, bucket_count);
check_cuda_error(cudaGetLastError());
// Phase 4: CPU Horner combine, result written directly to host pointer
//
// The Horner loop is inherently sequential. A single CPU core is much faster
// than a single GPU thread for this workload, so we run Horner on the CPU
// and write the result directly to the caller's host pointer.
// Phase 4: CPU Horner combine, result written directly to host pointer.
std::vector<ProjectiveType> h_window_sums(num_windows);
cuda_memcpy_async_to_cpu(
h_window_sums.data(), d_window_sums,
@@ -619,32 +670,48 @@ void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
// Dynamic Window Size Selection
// ============================================================================
// Select the Pippenger window size and bucket count for a G1 MSM.
//
// Trade-off: larger windows mean fewer Horner doublings but more bucket work,
// so the window size grows with the number of input points.
//
// Signed-digit preprocessing keeps the same window size c as unsigned
// Pippenger but halves the bucket count: bucket_count = 2^(c-1) + 1 instead of
// 2^c. Fewer buckets speed up Phase 2 (cross-block reduce) and Phase 3 (window
// sum suffix-scan), with no correction term overhead.
//
// bucket_count - 1 must be a power of 2 for kernel_compute_window_sums:
//   c=4 -> bucket_count = 9,  bucket_count - 1 = 8  = 2^3
//   c=5 -> bucket_count = 17, bucket_count - 1 = 16 = 2^4
//   c=6 -> bucket_count = 33, bucket_count - 1 = 32 = 2^5
//
// Parameters:
//   n            - number of input points (selects the size class).
//   window_size  - [out] window width c in bits (4, 5 or 6).
//   bucket_count - [out] 2^(c-1) + 1 buckets (index 0 is the "skip" bucket).
inline void get_g1_window_params(uint32_t n, uint32_t &window_size,
                                 uint32_t &bucket_count) {
  if (n <= MSM_G1_SMALL_THRESHOLD) {
    window_size = 4;
  } else if (n <= MSM_G1_MEDIUM_THRESHOLD) {
    window_size = 5;
  } else {
    window_size = 6;
  }
  // Signed digits need only half the unsigned bucket range, plus bucket 0.
  // Derived once from window_size so the two outputs cannot drift apart.
  bucket_count = (1u << (window_size - 1)) + 1;
}
// Select the Pippenger window size and bucket count for a G2 MSM.
//
// G2 uses a fixed window regardless of input count.
//
// G2 Phase 1 is memory-bandwidth bound: only 1 block fits per SM (shared mem
// limit). With c=5, bucket_count=17 only 16 threads are active in the bucket
// reduce (half-warp), causing a regression versus the original bucket_count=32
// (31 active).
//
// c=6, bucket_count=33 gives exactly 32 active threads (full first warp, same
// as c=6 on G1), keeps Phase 1 cost identical to the original bucket_count=32,
// and reduces windows from 64 to 54 (15.6% fewer). 54*6 = 324 > 320, so the
// last window is partial and the signed-digit carry never overflows; no extra
// window is needed.
//
// Parameters:
//   n            - number of input points (unused; kept for signature parity
//                  with get_g1_window_params).
//   window_size  - [out] window width in bits, always 6.
//   bucket_count - [out] 2^(window_size-1) + 1 = 33.
inline void get_g2_window_params(uint32_t n, uint32_t &window_size,
                                 uint32_t &bucket_count) {
  (void)n; // fixed window size for G2
  window_size = 6;
  bucket_count = (1u << (window_size - 1)) + 1; // 33 = 2^5 + 1
}
// ============================================================================
@@ -653,19 +720,18 @@ inline void get_g2_window_params(uint32_t n, uint32_t &window_size,
// Computes the exact scratch buffer size (in bytes) needed by
// point_msm_pippenger_impl_async for a given input count n. The formula must
// stay in sync with the scratch partitioning inside that function:
// all_block_buckets: num_windows * num_blocks_per_window * bucket_count
// all_final_buckets: num_windows * bucket_count
// window_sums: num_windows
// Factoring this into a helper avoids duplicating the formula in every caller
// and prevents the buffer-underallocation bug that occurs when callers use
// ad-hoc estimates.
// all_block_buckets: num_windows * num_blocks * bucket_count
// all_final_buckets: num_windows * bucket_count
// window_sums: num_windows
//
// d_signed_digits is allocated internally (stream-ordered) and is NOT included
// here; callers only need to provide the ProjectiveType scratch buffer.
template <typename AffineType, typename ProjectiveType>
size_t pippenger_scratch_size(uint32_t n, uint32_t gpu_index) {
if (n == 0)
return 0;
uint32_t window_size, bucket_count;
// Use the same window parameter selection as the MSM entry points
if constexpr (std::is_same_v<AffineType, G1Affine>) {
get_g1_window_params(n, window_size, bucket_count);
} else {
@@ -673,16 +739,15 @@ size_t pippenger_scratch_size(uint32_t n, uint32_t gpu_index) {
}
const uint32_t threads_per_block = msm_threads_per_block<AffineType>(n);
const uint32_t num_windows = CEIL_DIV(Scalar::NUM_BITS, window_size);
const uint32_t num_windows = (Scalar::NUM_BITS + window_size) / window_size;
// Phase1KernelLaunchParams computes the adjusted threads per block
// respecting shared memory limits, which determines num_blocks_per_window
Phase1KernelLaunchParams<AffineType> launch_params(n, threads_per_block,
bucket_count, gpu_index);
const size_t all_block_buckets_elems = static_cast<size_t>(num_windows) *
launch_params.num_blocks_per_window *
bucket_count;
const size_t num_blocks =
static_cast<size_t>(launch_params.num_blocks_per_window);
const size_t all_block_buckets_elems =
static_cast<size_t>(num_windows) * num_blocks * bucket_count;
const size_t all_final_buckets_elems =
static_cast<size_t>(num_windows) * bucket_count;
const size_t total_elems =

View File

@@ -7,8 +7,6 @@
#include <cuda_runtime.h>
// For CUDA device code, we use __constant__ memory
// Constants are hardcoded at compile time (like sppark) to avoid
// cudaMemcpyToSymbol
// Note: DEVICE_MODULUS is in normal form (not Montgomery)
__constant__ const Fp DEVICE_MODULUS = {BLS12_446_MODULUS_LIMBS};
@@ -104,7 +102,7 @@ __host__ __device__ ComparisonType fp_cmp(const Fp &a, const Fp &b) {
__host__ __device__ bool fp_is_zero(const Fp &a) {
// By doing this way we avoid branching
uint64_t acc = 0;
UNSIGNED_LIMB acc = 0;
for (int i = 0; i < FP_LIMBS; i++) {
acc |= a.limb[i];
}
@@ -114,8 +112,8 @@ __host__ __device__ bool fp_is_zero(const Fp &a) {
__host__ __device__ bool fp_is_one(const Fp &a) {
if (a.limb[0] != 1)
return false;
// By doing this way we avoid branching
uint64_t acc = 0;
// All higher limbs must be zero.
UNSIGNED_LIMB acc = 0;
for (int i = 1; i < FP_LIMBS; i++) {
acc |= a.limb[i];
}
@@ -207,6 +205,40 @@ __host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
"l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
"l"(b.limb[5]), "l"(b.limb[6]));
return carry_out;
#elif defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 32
// 32-bit PTX carry chain: add.cc.u32 sets the hardware carry flag,
// addc.cc.u32 propagates it. Eliminates software carry-detect comparisons
// across all 14 limbs.
// Operand map: %0..%13 = c[0..13], %14 = carry_out,
// %15..%28 = a[0..13], %29..%42 = b[0..13].
uint32_t carry_out;
asm("add.cc.u32 %0, %15, %29;\n\t" // c[0] = a[0] + b[0], set CF
"addc.cc.u32 %1, %16, %30;\n\t" // c[1] = a[1] + b[1] + CF
"addc.cc.u32 %2, %17, %31;\n\t" // c[2] = a[2] + b[2] + CF
"addc.cc.u32 %3, %18, %32;\n\t" // c[3] = a[3] + b[3] + CF
"addc.cc.u32 %4, %19, %33;\n\t" // c[4] = a[4] + b[4] + CF
"addc.cc.u32 %5, %20, %34;\n\t" // c[5] = a[5] + b[5] + CF
"addc.cc.u32 %6, %21, %35;\n\t" // c[6] = a[6] + b[6] + CF
"addc.cc.u32 %7, %22, %36;\n\t" // c[7] = a[7] + b[7] + CF
"addc.cc.u32 %8, %23, %37;\n\t" // c[8] = a[8] + b[8] + CF
"addc.cc.u32 %9, %24, %38;\n\t" // c[9] = a[9] + b[9] + CF
"addc.cc.u32 %10, %25, %39;\n\t" // c[10] = a[10] + b[10] + CF
"addc.cc.u32 %11, %26, %40;\n\t" // c[11] = a[11] + b[11] + CF
"addc.cc.u32 %12, %27, %41;\n\t" // c[12] = a[12] + b[12] + CF
"addc.cc.u32 %13, %28, %42;\n\t" // c[13] = a[13] + b[13] + CF
"addc.u32 %14, 0, 0;\n\t" // carry_out = 0 + 0 + CF (0 or 1)
: "=r"(c.limb[0]), "=r"(c.limb[1]), "=r"(c.limb[2]), "=r"(c.limb[3]),
"=r"(c.limb[4]), "=r"(c.limb[5]), "=r"(c.limb[6]), "=r"(c.limb[7]),
"=r"(c.limb[8]), "=r"(c.limb[9]), "=r"(c.limb[10]), "=r"(c.limb[11]),
"=r"(c.limb[12]), "=r"(c.limb[13]), "=r"(carry_out)
: "r"(a.limb[0]), "r"(a.limb[1]), "r"(a.limb[2]), "r"(a.limb[3]),
"r"(a.limb[4]), "r"(a.limb[5]), "r"(a.limb[6]), "r"(a.limb[7]),
"r"(a.limb[8]), "r"(a.limb[9]), "r"(a.limb[10]), "r"(a.limb[11]),
"r"(a.limb[12]), "r"(a.limb[13]), "r"(b.limb[0]), "r"(b.limb[1]),
"r"(b.limb[2]), "r"(b.limb[3]), "r"(b.limb[4]), "r"(b.limb[5]),
"r"(b.limb[6]), "r"(b.limb[7]), "r"(b.limb[8]), "r"(b.limb[9]),
"r"(b.limb[10]), "r"(b.limb[11]), "r"(b.limb[12]), "r"(b.limb[13]));
return static_cast<UNSIGNED_LIMB>(carry_out);
#else
// Host path: portable software carry detection
UNSIGNED_LIMB carry = 0;
@@ -248,6 +280,41 @@ __host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
// subc.u64 with 0-0-CF produces 0 if no borrow, or 0xFFFFFFFFFFFFFFFF if
// borrow. Normalize to 0/1 for callers that check (borrow != 0) or add it.
return borrow_out & 1;
#elif defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 32
// 32-bit PTX borrow chain: sub.cc.u32 sets the hardware borrow flag,
// subc.cc.u32 propagates it across all 14 limbs.
// subc.u32 with 0-0-BF gives 0xFFFFFFFF on borrow; normalize to 0/1.
// Operand map: %0..%13 = c[0..13], %14 = borrow_out,
// %15..%28 = a[0..13], %29..%42 = b[0..13].
uint32_t borrow_out;
asm("sub.cc.u32 %0, %15, %29;\n\t" // c[0] = a[0] - b[0], set BF
"subc.cc.u32 %1, %16, %30;\n\t" // c[1] = a[1] - b[1] - BF
"subc.cc.u32 %2, %17, %31;\n\t" // c[2] = a[2] - b[2] - BF
"subc.cc.u32 %3, %18, %32;\n\t" // c[3] = a[3] - b[3] - BF
"subc.cc.u32 %4, %19, %33;\n\t" // c[4] = a[4] - b[4] - BF
"subc.cc.u32 %5, %20, %34;\n\t" // c[5] = a[5] - b[5] - BF
"subc.cc.u32 %6, %21, %35;\n\t" // c[6] = a[6] - b[6] - BF
"subc.cc.u32 %7, %22, %36;\n\t" // c[7] = a[7] - b[7] - BF
"subc.cc.u32 %8, %23, %37;\n\t" // c[8] = a[8] - b[8] - BF
"subc.cc.u32 %9, %24, %38;\n\t" // c[9] = a[9] - b[9] - BF
"subc.cc.u32 %10, %25, %39;\n\t" // c[10] = a[10] - b[10] - BF
"subc.cc.u32 %11, %26, %40;\n\t" // c[11] = a[11] - b[11] - BF
"subc.cc.u32 %12, %27, %41;\n\t" // c[12] = a[12] - b[12] - BF
"subc.cc.u32 %13, %28, %42;\n\t" // c[13] = a[13] - b[13] - BF
"subc.u32 %14, 0, 0;\n\t" // borrow_out = 0 - 0 - BF (0 or
// 0xFFFFFFFF)
: "=r"(c.limb[0]), "=r"(c.limb[1]), "=r"(c.limb[2]), "=r"(c.limb[3]),
"=r"(c.limb[4]), "=r"(c.limb[5]), "=r"(c.limb[6]), "=r"(c.limb[7]),
"=r"(c.limb[8]), "=r"(c.limb[9]), "=r"(c.limb[10]), "=r"(c.limb[11]),
"=r"(c.limb[12]), "=r"(c.limb[13]), "=r"(borrow_out)
: "r"(a.limb[0]), "r"(a.limb[1]), "r"(a.limb[2]), "r"(a.limb[3]),
"r"(a.limb[4]), "r"(a.limb[5]), "r"(a.limb[6]), "r"(a.limb[7]),
"r"(a.limb[8]), "r"(a.limb[9]), "r"(a.limb[10]), "r"(a.limb[11]),
"r"(a.limb[12]), "r"(a.limb[13]), "r"(b.limb[0]), "r"(b.limb[1]),
"r"(b.limb[2]), "r"(b.limb[3]), "r"(b.limb[4]), "r"(b.limb[5]),
"r"(b.limb[6]), "r"(b.limb[7]), "r"(b.limb[8]), "r"(b.limb[9]),
"r"(b.limb[10]), "r"(b.limb[11]), "r"(b.limb[12]), "r"(b.limb[13]));
return static_cast<UNSIGNED_LIMB>(borrow_out & 1u);
#else
// Host path: portable software borrow detection
UNSIGNED_LIMB borrow = 0;
@@ -287,6 +354,17 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
UNSIGNED_LIMB mask =
-use_original; // all-ones if keep sum, all-zeros if keep reduced
for (int i = 0; i < FP_LIMBS; i++) {
c.limb[i] = (sum.limb[i] & mask) | (reduced.limb[i] & ~mask);
}
#elif defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 32
// Same branchless logic as the 64-bit path; mask arithmetic is identical
// since UNSIGNED_LIMB is uint32_t: -1u == 0xFFFFFFFF (all-ones).
Fp reduced;
UNSIGNED_LIMB borrow = fp_sub_raw(reduced, sum, fp_modulus());
UNSIGNED_LIMB use_original = ((carry ^ 1u) & borrow);
UNSIGNED_LIMB mask = -use_original;
for (int i = 0; i < FP_LIMBS; i++) {
c.limb[i] = (sum.limb[i] & mask) | (reduced.limb[i] & ~mask);
}
@@ -319,6 +397,15 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
UNSIGNED_LIMB mask =
-borrow; // all-ones if borrow (use corrected), all-zeros if not
for (int i = 0; i < FP_LIMBS; i++) {
c.limb[i] = (corrected.limb[i] & mask) | (diff.limb[i] & ~mask);
}
#elif defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 32
// Same branchless logic as the 64-bit path; -1u == 0xFFFFFFFF for uint32_t.
Fp corrected;
fp_add_raw(corrected, diff, fp_modulus());
UNSIGNED_LIMB mask = -borrow;
for (int i = 0; i < FP_LIMBS; i++) {
c.limb[i] = (corrected.limb[i] & mask) | (diff.limb[i] & ~mask);
}
@@ -333,6 +420,26 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
#endif
}
// Lazy addition: c = a + b with no modular reduction.
// For inputs in [0, p) the result lies in [0, 2p).
// Skips the conditional subtraction performed by fp_add. The carry returned by
// fp_add_raw is discarded, so the sum is assumed to fit in the limb array
// (presumably guaranteed by 2p < 2^(FP_LIMBS * LIMB_BITS) -- TODO confirm).
// Intended to feed fp_mont_mul, which is stated to accept operands in [0, 2p).
__host__ __device__ void fp_add_lazy(Fp &c, const Fp &a, const Fp &b) {
fp_add_raw(c, a, b);
}
// Lazy subtraction: c ≡ a - b (mod p), result in [0, 2p) for inputs in [0, p).
// Adds p unconditionally, skipping the borrow-dependent selection that fp_sub
// performs between the raw and the corrected difference.
// Valid as input to fp_mont_mul; must NOT be used where [0, p) is
// required (e.g. final results, inputs to fp_sub/fp_neg).
// The difference goes through a local temporary before c is written.
__host__ __device__ void fp_sub_lazy(Fp &c, const Fp &a, const Fp &b) {
Fp diff;
fp_sub_raw(diff, a, b); // a - b mod 2^N; returned borrow is ignored
fp_add_raw(c, diff, fp_modulus()); // always add p; returned carry is ignored
// For a >= b (no borrow): diff = a-b ∈ [0,p), result = a-b+p ∈ [p,2p) ✓
// For a < b (borrow=1): diff wraps, result = a-b+2^N+p mod 2^N = a-b+p ∈
// (0,p) ✓
}
// Small-constant multiplication via addition chains.
// These replace full Montgomery multiplications by 2, 3, 4, 8 with a few
// modular additions, each ~25 instructions vs ~200+ for CIOS Montgomery mul.
@@ -483,14 +590,32 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
for (int i = 0; i < FP_LIMBS; i++) {
UNSIGNED_LIMB u = t[i] * p_prime; // u = t[i] * p' mod 2^LIMB_BITS
// Add u * p to t, starting at position i
// Add u * p to t, starting at position i.
// Use uint64_t accumulator in 32-bit mode to avoid carry overflow:
// hi + carry1 + carry2 can reach 2^32 which overflows uint32_t.
#if LIMB_BITS_CONFIG == 32
uint64_t carry = 0;
for (int j = 0; j < FP_LIMBS; j++) {
uint64_t acc =
(uint64_t)t[i + j] + (uint64_t)u * (uint64_t)p.limb[j] + carry;
t[i + j] = (UNSIGNED_LIMB)acc;
carry = acc >> LIMB_BITS;
}
// Propagate remaining carry (carry ≤ 2^32-1 at this point)
int idx = i + FP_LIMBS;
while (carry != 0 && idx <= 2 * FP_LIMBS) {
uint64_t acc = (uint64_t)t[idx] + carry;
t[idx] = (UNSIGNED_LIMB)acc;
carry = acc >> LIMB_BITS;
idx++;
}
#else
UNSIGNED_LIMB carry = 0;
for (int j = 0; j < FP_LIMBS; j++) {
UNSIGNED_LIMB hi, lo;
mul_limbs(u, p.limb[j], hi, lo);
// Three-way addition: t[i+j] + lo + carry
// Do it in two steps to handle carries properly
UNSIGNED_LIMB temp = t[i + j] + lo;
UNSIGNED_LIMB carry1 = (temp < t[i + j]) ? 1 : 0;
@@ -499,7 +624,6 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
t[i + j] = sum;
// Next carry is hi + carry1 + carry2
carry = hi + carry1 + carry2;
}
@@ -511,6 +635,7 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
t[idx] = sum;
idx++;
}
#endif
}
// Result is in t[FP_LIMBS..2*FP_LIMBS-1] (high half)
@@ -534,29 +659,7 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
}
}
// ============================================================================
// PTX-accelerated CIOS Montgomery multiplication (device path)
// ============================================================================
// The CIOS algorithm for 7 x 64-bit limbs executes 98 multiply-accumulate
// steps across 7 outer iterations. Each step computes:
// (carry, t[j]) = t[j] + a[j] * b_i + carry
// which is a 64x64->128 multiply plus a three-operand addition with carry.
//
// The C++ path uses software carry detection: carry = (sum < old) ? 1 : 0.
// The PTX path below uses hardware carry flags via the .cc suffix:
// - mul.lo.u64 / mul.hi.u64 : 64x64->128 wide multiply
// - add.cc.u64 / addc.u64 : addition chain with hardware carry flag
//
// Each multiply-accumulate step uses 6 PTX instructions instead of ~10+ in
// the software-carry version. The 7 outer iterations are fully unrolled, and
// the limb-shift loop (t[j] = t[j+1]) is eliminated by register renaming.
//
// REGISTER ALIASING NOTE: All PTX temporaries (_lo, _hi) are declared as
// .reg inside the asm block. This prevents nvcc's register allocator from
// aliasing them with C operands (t_j, carry), which was the root cause of
// previous correctness bugs where "+l" outputs could share registers with
// "l" inputs in the same asm statement.
// ============================================================================
#ifdef __CUDA_ARCH__
#if LIMB_BITS_CONFIG == 64
@@ -735,14 +838,408 @@ __device__ __noinline__ void fp_mont_mul_cios_ptx(Fp &c, const Fp &a,
#endif // LIMB_BITS_CONFIG == 64
#endif // __CUDA_ARCH__
// 32-bit dual MAD-chain Montgomery multiplication (device path)
#ifdef __CUDA_ARCH__
// PTX carry-chain primitives for 32-bit Montgomery arithmetic.
//
// These are macros rather than __forceinline__ functions because the hardware
// carry flag (CC register) does not survive a function-call boundary.
//
// NOTE(review): the fp_*_n_32 helpers further down are __forceinline__
// functions that leave CC live on return, so they depend on being inlined.
// NOTE(review): multi-instruction asm bodies here use plain "=r" outputs (no
// early-clobber). If the compiler may assign an output the same register as an
// input that a later instruction still reads, the result is wrong -- confirm
// against the inline-PTX constraint rules.

// (lo, hi) = a * b : 64-bit product, no carry in or out.
// Initialises a fresh wide accumulator slot.
#define FP_MUL_WIDE_32(lo, hi, a, b) \
asm("mul.lo.u32 %0, %2, %3; mul.hi.u32 %1, %2, %3;" \
: "=r"(lo), "=r"(hi) \
: "r"(a), "r"(b))
// lo += lo(a*b); hi += hi(a*b) + CC. Sets CC.
// Opens a carry chain (mad.lo.cc / madc.hi.cc): no carry in on the low half.
#define FP_MAD_WIDE_CC_32(lo, hi, a, b) \
asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1;" \
: "+r"(lo), "+r"(hi) \
: "r"(a), "r"(b))
// lo += lo(a*b) + CC; hi += hi(a*b) + CC. Sets CC.
// Continues a carry chain (madc.lo.cc / madc.hi.cc).
#define FP_MADC_WIDE_CC_32(lo, hi, a, b) \
asm("madc.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1;" \
: "+r"(lo), "+r"(hi) \
: "r"(a), "r"(b))
// r += CC. No carry out: terminates a carry chain.
#define FP_ADDC_32(r) asm("addc.u32 %0, %0, 0;" : "+r"(r))
// dst = src + CC. No carry out: terminates a carry chain.
#define FP_ADDC_INTO_32(dst, src) \
asm("addc.u32 %0, %1, 0;" : "=r"(dst) : "r"(src))
// r = CC (capture carry flag as 0 or 1). No carry out.
#define FP_CARRY_32(r) asm("addc.u32 %0, 0, 0;" : "=r"(r))
// dst = src + src. Sets CC; opens a left-shift (doubling) chain.
#define FP_DBL_CC_32(dst, src) \
asm("add.cc.u32 %0, %1, %1;" : "=r"(dst) : "r"(src))
// r = r + r + CC. Sets CC; continues a left-shift (doubling) chain.
#define FP_DBLC_CC_32(r) asm("addc.cc.u32 %0, %0, %0;" : "+r"(r))
// dst = lo32 | (hi32 << 32): pack two 32-bit halves into one 64-bit register.
#define FP_PACK_U64(dst, lo32, hi32) \
asm("mov.b64 %0, {%1, %2};" : "=l"(dst) : "r"(lo32), "r"(hi32))
// Initialize acc[0..n-1] with the wide products a[j] * bi for every other
// limb of a (j = 0, 2, 4, ..., n-2).
// For each such j: acc[j] = lo(a[j]*bi), acc[j+1] = hi(a[j]*bi).
// Overwrites acc (no accumulation) and uses no carry flag. Callers pass `a`
// or `a + 1` to select the even- or odd-indexed limbs of the operand.
// n is expected to be even: each step writes acc[j] and acc[j+1].
static __device__ __forceinline__ void
fp_mul_n_32(uint32_t *acc, const uint32_t *a, uint32_t bi, int n) {
#pragma unroll
for (int j = 0; j < n; j += 2) {
asm("mul.lo.u32 %0, %1, %2;" : "=r"(acc[j]) : "r"(a[j]), "r"(bi));
asm("mul.hi.u32 %0, %1, %2;" : "=r"(acc[j + 1]) : "r"(a[j]), "r"(bi));
}
}
// Multiply-accumulate across n accumulator limbs with a hardware carry chain:
// for j = 0, 2, ..., n-2 the pair (acc[j], acc[j+1]) += a[j] * bi.
// First pair uses mad.lo.cc + madc.hi.cc (initiates the chain, so any incoming
// CC value is ignored). Remaining pairs continue with madc.lo.cc + madc.hi.cc.
// Carry flag exits in CC on return; caller must consume it.
// Only every other limb of `a` is read (a[0], a[2], ...); callers pass `a + 1`
// to select the odd-indexed limbs.
static __device__ __forceinline__ void
fp_cmad_n_32(uint32_t *acc, const uint32_t *a, uint32_t bi, int n) {
asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1;"
: "+r"(acc[0]), "+r"(acc[1])
: "r"(a[0]), "r"(bi));
#pragma unroll
for (int j = 2; j < n; j += 2)
asm("madc.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1;"
: "+r"(acc[j]), "+r"(acc[j + 1])
: "r"(a[j]), "r"(bi));
// CC holds the final carry on return
}
// Multiply-accumulate with implicit right-shift of odd by two positions.
// Each pair (j even): (odd[j], odd[j+1]) = a[j]*bi + (old odd[j+2], odd[j+3])
// + CC, i.e. the old contents are read two limbs ahead of where they land.
// Reads are always two positions ahead of writes so forward iteration is safe.
// The first instruction is madc.lo.cc, so the caller must have set CC
// beforehand (this function consumes an incoming carry).
// Final pair terminates the chain with addend=0 and no carry-out (.hi only).
// NOTE(review): outputs use "=r" without early-clobber while inputs %4/%5 are
// read by the second instruction -- confirm the compiler cannot alias %0 with
// a later-read input.
static __device__ __forceinline__ void
fp_madc_n_rshift_32(uint32_t *odd, const uint32_t *a, uint32_t bi, int n) {
#pragma unroll
for (int j = 0; j < n - 2; j += 2)
asm("madc.lo.cc.u32 %0, %2, %3, %4; madc.hi.cc.u32 %1, %2, %3, %5;"
: "=r"(odd[j]), "=r"(odd[j + 1])
: "r"(a[j]), "r"(bi), "r"(odd[j + 2]), "r"(odd[j + 3]));
asm("madc.lo.cc.u32 %0, %2, %3, 0; madc.hi.u32 %1, %2, %3, 0;"
: "=r"(odd[n - 2]), "=r"(odd[n - 1])
: "r"(a[n - 2]), "r"(bi));
// Note: the final madc.hi.u32 has no .cc suffix, so it consumes the pending
// carry but does not write CC. No carry-out is produced for the caller, and
// CC must not be relied upon after return.
}
// One fused multiply + Montgomery-reduction step over split accumulators:
// adds a * bi into (even, odd), then adds p * mi with mi chosen to cancel the
// lowest word.
//   even, odd - accumulators of n = 14 words each; products of even-indexed
//               limbs go to `even`, products of odd-indexed limbs to `odd`.
//   a         - multiplicand limbs (a[0..13]).
//   p         - modulus limbs (p[0..13]).
//   bi        - current multiplier limb.
//   M0        - presumably -p^{-1} mod 2^32 (Montgomery constant) -- confirm.
//   first     - true on the first step: accumulators are overwritten rather
//               than shifted and accumulated.
// After the call even[0] == 0 (by the Montgomery invariant), so the next
// iteration's right-shift effectively advances the window by one limb.
static __device__ __forceinline__ void
fp_mad_n_redc_32(uint32_t *even, uint32_t *odd, const uint32_t *a,
const uint32_t *p, uint32_t bi, uint32_t M0, bool first) {
constexpr int n = 14; // 32-bit limbs for BLS12-446 (446 bits → 14 × 32-bit)
if (first) {
// Fresh initialization: no carry from previous iteration.
// even[2j] = lo(a[2j] * bi), even[2j+1] = hi(a[2j] * bi)
// odd[2j] = lo(a[2j+1] * bi), odd[2j+1] = hi(a[2j+1] * bi)
fp_mul_n_32(even, a, bi, n);
fp_mul_n_32(odd, a + 1, bi, n);
} else {
// Merge carry from previous iteration and advance both accumulators.
// add.cc opens the chain that fp_madc_n_rshift_32 continues; the statement
// order below is therefore significant.
asm("add.cc.u32 %0, %0, %1;" : "+r"(even[0]) : "r"(odd[1]));
fp_madc_n_rshift_32(odd, a + 1, bi, n);
fp_cmad_n_32(even, a, bi, n);
// Fold the carry out of the even chain into the top odd word.
asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
}
// Montgomery reduction: choose mi so that even[0] + lo(p[0]*mi) = 0 mod 2^32
uint32_t mi = even[0] * M0;
fp_cmad_n_32(odd, p + 1, mi, n);
fp_cmad_n_32(even, p, mi, n);
// Fold the carry out of the even chain into the top odd word.
asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
}
// Carry-add: acc[i] += a[i] for i = 0..n-1 with PTX carry chain.
// Starts with add.cc (initiates chain, ignoring any incoming CC); all
// subsequent adds use addc.cc.
// The carry out of the last limb is left in CC on return for the caller to
// consume; it is not stored anywhere by this function.
static __device__ __forceinline__ void fp_cadd_n_32(uint32_t *acc,
const uint32_t *a, int n) {
asm("add.cc.u32 %0, %0, %1;" : "+r"(acc[0]) : "r"(a[0]));
#pragma unroll
for (int i = 1; i < n; i++)
asm("addc.cc.u32 %0, %0, %1;" : "+r"(acc[i]) : "r"(a[i]));
}
// Even row of the upper-triangle squaring pass.
// Adds a[1..n-2]*bi into odd[0..n-3] (cmad chain), places a[n-1]*bi into
// odd[n-2..n-1] fresh (terminates carry), then adds a[0..n-1]*bi into
// even[0..n-1] (independent cmad chain), folding the even carry into odd[n-1].
// NOTE(review): the index ranges above are nominal. fp_cmad_n_32 is defined
// outside this view; if it advances two words per product (as
// fp_madc_n_rshift_32 does), only every other a[] word in each range is
// actually consumed — TODO confirm.
// n is the row length in words and shrinks as the caller walks the triangle.
static __device__ __forceinline__ void fp_mad_row_32(uint32_t *odd,
uint32_t *even,
const uint32_t *a,
uint32_t bi, int n) {
fp_cmad_n_32(odd, a + 1, bi, n - 2);
// Continues the carry chain left by the cmad above; the final madc.hi has no
// .cc, which ends that chain.
asm("madc.lo.cc.u32 %0, %2, %3, 0; madc.hi.u32 %1, %2, %3, 0;"
: "=r"(odd[n - 2]), "=r"(odd[n - 1])
: "r"(a[n - 1]), "r"(bi));
fp_cmad_n_32(even, a, bi, n);
// Fold the carry left by the even chain into the top odd word.
asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
}
// Odd row of the upper-triangle squaring pass.
// Adds a[0..n-3]*bi into odd[0..n-3] (cmad chain), places a[n-2]*bi into
// odd[n-2..n-1] fresh, then adds a[1..n-2]*bi into even[0..n-3] (n-2 terms),
// folding the even carry into odd[n-1].
// Differs from fp_mad_row_32 only in the operand offsets (a vs a + 1), in which
// word feeds the fresh top pair (a[n-2] vs a[n-1]) and in the length of the
// even chain (n - 2 vs n).
// NOTE(review): index ranges are nominal; the exact words consumed depend on
// the stride of fp_cmad_n_32, which is defined outside this view.
static __device__ __forceinline__ void fp_qad_row_32(uint32_t *odd,
uint32_t *even,
const uint32_t *a,
uint32_t bi, int n) {
fp_cmad_n_32(odd, a, bi, n - 2);
// Continues the carry chain left by the cmad above; the final madc.hi has no
// .cc, which ends that chain.
asm("madc.lo.cc.u32 %0, %2, %3, 0; madc.hi.u32 %1, %2, %3, 0;"
: "=r"(odd[n - 2]), "=r"(odd[n - 1])
: "r"(a[n - 2]), "r"(bi));
fp_cmad_n_32(even, a + 1, bi, n - 2);
// Fold the carry left by the even chain into the top odd word.
asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
}
// One Montgomery-reduction row without a multiply step (b_i = 0).
// Used by fp_mont_sqr_mad32 to reduce the lower n words of the wide product.
// Mirrors fp_mad_n_redc_32 but omits the initial product accumulation, leaving
// only the annihilation step that drives even[0] to zero.
// Parameters:
//   even  - accumulator whose word 0 is cancelled by this row
//   odd   - companion accumulator; written fresh when first == true,
//           otherwise shifted and accumulated
//   p     - modulus as 32-bit words
//   M0    - 32-bit Montgomery constant; mi = even[0] * M0
//   first - true only for the first row of a reduction
static __device__ __forceinline__ void
fp_mul_by_1_row_32(uint32_t *even, uint32_t *odd, const uint32_t *p,
uint32_t M0, bool first) {
constexpr int n = 14;
// mi removes even[0]: even[0] + lo(p[0]*mi) == 0 mod 2^32.
// IMPORTANT: mi must be computed from even[0] *after* any add.cc that
// modifies it. Plain integer multiply does not touch CC.
uint32_t mi;
if (first) {
mi = even[0] * M0;
// odd is overwritten (not accumulated) on the first row.
fp_mul_n_32(odd, p + 1, mi, n);
fp_cmad_n_32(even, p, mi, n);
// Fold the carry left by the even chain into the top odd word.
asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
} else {
// Absorb the shifted carry word from the previous step, then reduce.
asm("add.cc.u32 %0, %0, %1;" : "+r"(even[0]) : "r"(odd[1]));
// Use PTX mul explicitly: a plain C multiply after add.cc could in theory
// let the compiler insert an instruction that clobbers CC before
// madc_n_rshift.
asm("mul.lo.u32 %0, %1, %2;" : "=r"(mi) : "r"(even[0]), "r"(M0));
// Consumes the carry produced by the add.cc above.
fp_madc_n_rshift_32(odd, p + 1, mi, n);
fp_cmad_n_32(even, p, mi, n);
// Fold the carry left by the even chain into the top odd word.
asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
}
}
// Montgomery squaring with triangular 32-bit MAD chains.
// See fp_mont_mul_mad32 for the algorithm reference (Koç et al., 1996).
// Unlike fp_mont_mul_mad32, which multiplies and reduces one word at a time,
// this routine first builds the full 2n-word square and reduces afterwards:
//   Phase 1 - upper-triangle products a[i]*a[j], j > i
//   Phase 2 - double that sum
//   Phase 3 - add the diagonal squares a[i]^2
//   Phase 4 - Montgomery-reduce the low n words and fold them into the high n
// followed by one branchless conditional subtraction of p.
//
// Computes c = a^2 * R^{-1} mod p (input and output in Montgomery form).
// c is only written after every read of a, so c may alias a.
__device__ __noinline__ void fp_mont_sqr_mad32(Fp &c, const Fp &a) {
constexpr int n = 14;
// View the limb storage as 32-bit words (little-endian word order when the
// limbs are 64-bit).
const uint32_t *a32 = reinterpret_cast<const uint32_t *>(a.limb);
const uint32_t *p32 = reinterpret_cast<const uint32_t *>(DEVICE_MODULUS.limb);
const uint32_t M0 = static_cast<uint32_t>(DEVICE_P_PRIME);
// wide accumulates the 2n-word square; wtemp holds the interleaved partial
// sums that are merged into wide (offset by one word) at the end of Phase 1.
uint32_t wide[2 * n], wtemp[2 * n - 2];
// Phase 1: upper triangle a[i]*a[j] for j > i
fp_mul_n_32(wtemp, a32 + 1, a32[0], n);
fp_mul_n_32(wide + 2, a32 + 2, a32[0], n - 2);
#pragma unroll
for (int i = 2; i <= n - 4; i += 2) {
fp_mad_row_32(&wide[2 * i], &wtemp[2 * i - 2], &a32[i], a32[i - 1], n - i);
fp_qad_row_32(&wtemp[2 * i], &wide[2 * i + 2], &a32[i + 1], a32[i], n - i);
}
// Tail of the triangle: the remaining products among the top three words,
// handled outside the loop.
FP_MUL_WIDE_32(wide[2 * n - 4], wide[2 * n - 3], a32[n - 1], a32[n - 3]);
FP_MAD_WIDE_CC_32(wtemp[2 * n - 6], wtemp[2 * n - 5], a32[n - 2], a32[n - 3]);
FP_ADDC_32(wide[2 * n - 3]);
FP_MUL_WIDE_32(wtemp[2 * n - 4], wtemp[2 * n - 3], a32[n - 1], a32[n - 2]);
// Merge the two partial accumulators: wide[2..] += wtemp[1..].
fp_cadd_n_32(&wide[2], &wtemp[1], 2 * n - 4);
FP_ADDC_INTO_32(wide[2 * n - 2], wtemp[2 * n - 3]);
// Phase 2: double the upper-triangle sum (left-shift the 2n-bit value by 1)
wide[0] = 0;
FP_DBL_CC_32(wide[1], wtemp[0]);
#pragma unroll
for (int j = 2; j < 2 * n - 1; j++)
FP_DBLC_CC_32(wide[j]);
FP_CARRY_32(wide[2 * n - 1]);
// Phase 3: add diagonal a[i]^2 terms (squares of each limb)
FP_MAD_WIDE_CC_32(wide[0], wide[1], a32[0], a32[0]);
#pragma unroll
for (int i = 1; i < n; i++)
FP_MADC_WIDE_CC_32(wide[2 * i], wide[2 * i + 1], a32[i], a32[i]);
// Phase 4: Montgomery reduction
// red_odd is the companion accumulator for the reduction rows; the first row
// (first == true) writes it via fp_mul_n_32 before it is ever read.
uint32_t red_odd[n];
#pragma unroll
for (int i = 0; i < n; i += 2) {
// Two rows per iteration with the accumulator roles swapped, n rows total.
fp_mul_by_1_row_32(&wide[0], &red_odd[0], p32, M0, i == 0);
fp_mul_by_1_row_32(&red_odd[0], &wide[0], p32, M0, false);
}
// Merge the final red_odd word into wide[0..n-1].
fp_cadd_n_32(&wide[0], &red_odd[1], n - 1);
FP_ADDC_32(wide[n - 1]);
// Add reduced lower half into upper half wide[n..2n-1]; the result lives
// in wide[n..2n-1] and is in [0, 2p).
fp_cadd_n_32(&wide[n], &wide[0], n);
FP_CARRY_32(wide[0]); // discard overflow (always 0 for p<2^446)
#if LIMB_BITS_CONFIG == 64
// Pack uint32_t pairs back into uint64_t limbs.
#pragma unroll
for (int j = 0; j < 7; j++)
FP_PACK_U64(c.limb[j], wide[n + 2 * j], wide[n + 2 * j + 1]);
const uint64_t p0 = DEVICE_MODULUS.limb[0], p1 = DEVICE_MODULUS.limb[1],
p2 = DEVICE_MODULUS.limb[2], p3 = DEVICE_MODULUS.limb[3],
p4 = DEVICE_MODULUS.limb[4], p5 = DEVICE_MODULUS.limb[5],
p6 = DEVICE_MODULUS.limb[6];
uint64_t r0, r1, r2, r3, r4, r5, r6, mask64;
// Branchless conditional subtraction: r = c - p, and mask64 becomes all-ones
// when the subtraction borrowed (c < p, keep c) or zero otherwise (use r).
asm("sub.cc.u64 %0, %8, %15;\n\t"
"subc.cc.u64 %1, %9, %16;\n\t"
"subc.cc.u64 %2, %10, %17;\n\t"
"subc.cc.u64 %3, %11, %18;\n\t"
"subc.cc.u64 %4, %12, %19;\n\t"
"subc.cc.u64 %5, %13, %20;\n\t"
"subc.cc.u64 %6, %14, %21;\n\t"
"subc.u64 %7, 0, 0;\n\t"
"shr.s64 %7, %7, 63;\n\t"
: "=l"(r0), "=l"(r1), "=l"(r2), "=l"(r3), "=l"(r4), "=l"(r5), "=l"(r6),
"=l"(mask64)
: "l"(c.limb[0]), "l"(c.limb[1]), "l"(c.limb[2]), "l"(c.limb[3]),
"l"(c.limb[4]), "l"(c.limb[5]), "l"(c.limb[6]), "l"(p0), "l"(p1),
"l"(p2), "l"(p3), "l"(p4), "l"(p5), "l"(p6));
c.limb[0] = (c.limb[0] & mask64) | (r0 & ~mask64);
c.limb[1] = (c.limb[1] & mask64) | (r1 & ~mask64);
c.limb[2] = (c.limb[2] & mask64) | (r2 & ~mask64);
c.limb[3] = (c.limb[3] & mask64) | (r3 & ~mask64);
c.limb[4] = (c.limb[4] & mask64) | (r4 & ~mask64);
c.limb[5] = (c.limb[5] & mask64) | (r5 & ~mask64);
c.limb[6] = (c.limb[6] & mask64) | (r6 & ~mask64);
#else
// 32-bit limbs: copy the words straight across, then select between c and
// c - p using the borrow returned by fp_sub_raw.
#pragma unroll
for (int j = 0; j < n; j++)
c.limb[j] = wide[n + j];
Fp reduced;
UNSIGNED_LIMB borrow = fp_sub_raw(reduced, c, fp_modulus());
UNSIGNED_LIMB mask32 = -borrow;
#pragma unroll
for (int j = 0; j < n; j++)
c.limb[j] = (c.limb[j] & mask32) | (reduced.limb[j] & ~mask32);
#endif
}
// Montgomery multiplication using CIOS (Coarsely Integrated Operand Scanning):
// Computes c = a * b * R^{-1} mod p (all operands in Montgomery form).
// With LIMB_BITS_CONFIG == 64 the inputs are stored as uint64_t[7]; they are
// reinterpreted as uint32_t[14]
// (little-endian: a64[j] == a32[2j] | (a32[2j+1] << 32)).
// c is only written after every read of a and b, so c may alias either input.
__device__ __noinline__ void fp_mont_mul_mad32(Fp &c, const Fp &a,
const Fp &b) {
constexpr int n = 14;
// Reinterpret 64-bit limb arrays as 32-bit on little-endian hardware.
const uint32_t *a32 = reinterpret_cast<const uint32_t *>(a.limb);
const uint32_t *b32 = reinterpret_cast<const uint32_t *>(b.limb);
const uint32_t *p32 = reinterpret_cast<const uint32_t *>(DEVICE_MODULUS.limb);
// 32-bit Montgomery constant: low 32 bits of DEVICE_P_PRIME.
// Correct because -p^{-1} mod 2^32 == (-p^{-1} mod 2^64) mod 2^32.
const uint32_t M0 = static_cast<uint32_t>(DEVICE_P_PRIME);
// Two accumulators that swap roles on every row; the first row
// (first == true) writes both before either is read.
uint32_t even[n], odd[n];
// Process every 32-bit limb of b in pairs, alternating primary accumulator.
#pragma unroll
for (int i = 0; i < n; i += 2) {
fp_mad_n_redc_32(even, odd, a32, p32, b32[i], M0, i == 0);
fp_mad_n_redc_32(odd, even, a32, p32, b32[i + 1], M0, false);
}
// Merge: even[0..n-2] += odd[1..n-1], propagate final carry into even[n-1].
fp_cadd_n_32(even, odd + 1, n - 1);
FP_ADDC_32(even[n - 1]);
// Pack and final reduction layout depends on LIMB_BITS_CONFIG.
// In both cases UNSIGNED_LIMB* and uint32_t* point to the same 56-byte block.
#if LIMB_BITS_CONFIG == 64
// 64-bit limbs: pack pairs into uint64_t with PTX mov.b64, then do a
// branchless 7-limb 64-bit conditional subtraction.
#pragma unroll
for (int j = 0; j < 7; j++)
FP_PACK_U64(c.limb[j], even[2 * j], even[2 * j + 1]);
// subc.u64 0-0-borrow gives 0xFFFF... when c<p (keep), 0 when c>=p (reduce).
// shr.s64 sign-extends to a per-bit selection mask.
const uint64_t p0 = DEVICE_MODULUS.limb[0], p1 = DEVICE_MODULUS.limb[1],
p2 = DEVICE_MODULUS.limb[2], p3 = DEVICE_MODULUS.limb[3],
p4 = DEVICE_MODULUS.limb[4], p5 = DEVICE_MODULUS.limb[5],
p6 = DEVICE_MODULUS.limb[6];
uint64_t r0, r1, r2, r3, r4, r5, r6, mask64;
asm("sub.cc.u64 %0, %8, %15;\n\t"
"subc.cc.u64 %1, %9, %16;\n\t"
"subc.cc.u64 %2, %10, %17;\n\t"
"subc.cc.u64 %3, %11, %18;\n\t"
"subc.cc.u64 %4, %12, %19;\n\t"
"subc.cc.u64 %5, %13, %20;\n\t"
"subc.cc.u64 %6, %14, %21;\n\t"
"subc.u64 %7, 0, 0;\n\t"
"shr.s64 %7, %7, 63;\n\t"
: "=l"(r0), "=l"(r1), "=l"(r2), "=l"(r3), "=l"(r4), "=l"(r5), "=l"(r6),
"=l"(mask64)
: "l"(c.limb[0]), "l"(c.limb[1]), "l"(c.limb[2]), "l"(c.limb[3]),
"l"(c.limb[4]), "l"(c.limb[5]), "l"(c.limb[6]), "l"(p0), "l"(p1),
"l"(p2), "l"(p3), "l"(p4), "l"(p5), "l"(p6));
// Select per limb: keep c where the mask is set, take c - p elsewhere.
c.limb[0] = (c.limb[0] & mask64) | (r0 & ~mask64);
c.limb[1] = (c.limb[1] & mask64) | (r1 & ~mask64);
c.limb[2] = (c.limb[2] & mask64) | (r2 & ~mask64);
c.limb[3] = (c.limb[3] & mask64) | (r3 & ~mask64);
c.limb[4] = (c.limb[4] & mask64) | (r4 & ~mask64);
c.limb[5] = (c.limb[5] & mask64) | (r5 & ~mask64);
c.limb[6] = (c.limb[6] & mask64) | (r6 & ~mask64);
#else
// 32-bit limbs: copy the words straight across, then select between c and
// c - p using the borrow returned by fp_sub_raw.
#pragma unroll
for (int j = 0; j < n; j++)
c.limb[j] = even[j];
Fp reduced;
UNSIGNED_LIMB borrow = fp_sub_raw(reduced, c, fp_modulus());
UNSIGNED_LIMB mask32 = -borrow; // all-ones if c<p (keep), all-zeros if c>=p
#pragma unroll
for (int j = 0; j < n; j++)
c.limb[j] = (c.limb[j] & mask32) | (reduced.limb[j] & ~mask32);
#endif
}
#undef FP_MUL_WIDE_32
#undef FP_MAD_WIDE_CC_32
#undef FP_MADC_WIDE_CC_32
#undef FP_ADDC_32
#undef FP_ADDC_INTO_32
#undef FP_CARRY_32
#undef FP_DBL_CC_32
#undef FP_DBLC_CC_32
#undef FP_PACK_U64
#endif // __CUDA_ARCH__
// CIOS (Coarsely Integrated Operand Scanning) Montgomery multiplication
// Fuses multiplication and reduction in a single pass for better efficiency.
// Uses only FP_LIMBS+1 limbs of working space instead of 2*FP_LIMBS.
// Both a and b are in Montgomery form, result is in Montgomery form.
__host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
// Device path: fully unrolled PTX with hardware carry flags
fp_mont_mul_cios_ptx(c, a, b);
#ifdef __CUDA_ARCH__
// Device path: 32-bit dual MAD chain
fp_mont_mul_mad32(c, a, b);
#else
// Host path: portable C++ implementation
const Fp &p = fp_modulus();
@@ -750,11 +1247,31 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
// Working array: only n+1 limbs needed (vs 2n for separate mul+reduce)
UNSIGNED_LIMB t[FP_LIMBS + 1];
// memset is not guaranteed available in all device compilation contexts;
// use an explicit loop which the compiler will unroll anyway.
#ifdef __CUDA_ARCH__
for (int i = 0; i <= FP_LIMBS; i++) {
t[i] = 0;
}
#else
memset(t, 0, (FP_LIMBS + 1) * sizeof(UNSIGNED_LIMB));
#endif
// Main CIOS loop: for each limb of b
for (int i = 0; i < FP_LIMBS; i++) {
// Step 1: Multiply-accumulate t += a * b[i]
#if LIMB_BITS_CONFIG == 32
uint64_t carry64 = 0;
for (int j = 0; j < FP_LIMBS; j++) {
uint64_t acc =
(uint64_t)t[j] + (uint64_t)a.limb[j] * (uint64_t)b.limb[i] + carry64;
t[j] = (UNSIGNED_LIMB)acc;
carry64 = acc >> LIMB_BITS;
}
uint64_t sum64 = (uint64_t)t[FP_LIMBS] + carry64;
UNSIGNED_LIMB overflow = (UNSIGNED_LIMB)(sum64 >> LIMB_BITS);
t[FP_LIMBS] = (UNSIGNED_LIMB)sum64;
#else
UNSIGNED_LIMB carry = 0;
for (int j = 0; j < FP_LIMBS; j++) {
UNSIGNED_LIMB hi, lo;
@@ -767,18 +1284,31 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
UNSIGNED_LIMB c2 = (sum2 < sum1) ? 1 : 0;
t[j] = sum2;
// carry = hi + c1 + c2
carry = hi + c1 + c2;
}
// Add carry to t[n]
UNSIGNED_LIMB sum = t[FP_LIMBS] + carry;
UNSIGNED_LIMB overflow = (sum < t[FP_LIMBS]) ? 1 : 0;
t[FP_LIMBS] = sum;
#endif
// Step 2: Reduction - compute m = t[0] * p' mod 2^LIMB_BITS
UNSIGNED_LIMB m = t[0] * p_prime;
// Add m * p to t (this zeros out t[0])
#if LIMB_BITS_CONFIG == 32
carry64 = 0;
for (int j = 0; j < FP_LIMBS; j++) {
uint64_t acc =
(uint64_t)t[j] + (uint64_t)m * (uint64_t)p.limb[j] + carry64;
t[j] = (UNSIGNED_LIMB)acc;
carry64 = acc >> LIMB_BITS;
}
// Merge carry from reduction with the overflow from step 1.
// sum64 ≤ (2^32-1) + (2^32-1) + 1 = 2^33-1, so the new overflow is 0 or 1.
uint64_t s64 = (uint64_t)t[FP_LIMBS] + carry64 + (uint64_t)overflow;
t[FP_LIMBS] = (UNSIGNED_LIMB)s64;
overflow = (UNSIGNED_LIMB)(s64 >> LIMB_BITS);
#else
carry = 0;
for (int j = 0; j < FP_LIMBS; j++) {
UNSIGNED_LIMB hi, lo;
@@ -800,6 +1330,7 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
UNSIGNED_LIMB c2 = (s2 < s1) ? 1 : 0;
t[FP_LIMBS] = s2;
overflow = c1 + c2; // Track overflow for final reduction
#endif
// Step 3: Shift right by one limb (divide by 2^LIMB_BITS)
// t[0..n-1] = t[1..n], t[n] = overflow
@@ -810,7 +1341,13 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
}
// Copy result to output
#ifdef __CUDA_ARCH__
for (int i = 0; i < FP_LIMBS; i++) {
c.limb[i] = t[i];
}
#else
memcpy(&c.limb[0], t, FP_LIMBS * sizeof(UNSIGNED_LIMB));
#endif
// Final reduction: if result >= p or there's overflow, subtract p
if (t[FP_LIMBS] != 0 || fp_cmp(c, p) != ComparisonType::Less) {
@@ -829,6 +1366,19 @@ __host__ __device__ void fp_mont_mul(Fp &c, const Fp &a, const Fp &b) {
fp_mont_mul_cios(c, a, b);
}
// Montgomery squaring: c = (a^2 * R_INV) mod p
// Input and output in Montgomery form.
// On device: uses fp_mont_sqr_mad32 (triangular MAD chain). The original
// estimate is ~30-40% fewer multiplications than fp_mont_mul(c, a, a);
// NOTE(review): that figure is not verified here — the reduction phase still
// costs the same as for a general multiplication.
// On host: delegates to fp_mont_mul_cios(c, a, a).
__host__ __device__ void fp_mont_sqr(Fp &c, const Fp &a) {
#ifdef __CUDA_ARCH__
fp_mont_sqr_mad32(c, a);
#else
fp_mont_mul_cios(c, a, a);
#endif
}
// CONVERSION: Convert from normal form to Montgomery form
// Input a is in normal form, output c is in Montgomery form
// Uses CIOS: c = a * R^2 * R^-1 mod p = a * R mod p
@@ -900,9 +1450,9 @@ __host__ __device__ static void fp_pow_internal_mont(Fp &result,
int start_bit = (limb_idx == msb_idx) ? bit_pos : LIMB_BITS - 1;
for (int bit = start_bit; bit >= 0; bit--) {
// Square result
// Square result using the optimised squaring path
Fp temp;
fp_mont_mul(temp, result, result);
fp_mont_sqr(temp, result);
fp_copy(result, temp);
// Multiply by base if current bit is set
@@ -1081,7 +1631,7 @@ __host__ __device__ bool fp_sqrt(Fp &c, const Fp &a) {
// Verify: c^2 should equal a (mod p) - using Montgomery form
Fp c_mont, c_squared_mont;
fp_to_montgomery(c_mont, c);
fp_mont_mul(c_squared_mont, c_mont, c_mont);
fp_mont_sqr(c_squared_mont, c_mont);
if (fp_cmp(c_squared_mont, a_mont) == ComparisonType::Equal) {
return true;
@@ -1091,7 +1641,7 @@ __host__ __device__ bool fp_sqrt(Fp &c, const Fp &a) {
Fp alt_c, alt_c_mont;
fp_sub(alt_c, p, c);
fp_to_montgomery(alt_c_mont, alt_c);
fp_mont_mul(c_squared_mont, alt_c_mont, alt_c_mont);
fp_mont_sqr(c_squared_mont, alt_c_mont);
if (fp_cmp(c_squared_mont, a_mont) == ComparisonType::Equal) {
fp_copy(c, alt_c);
return true;
@@ -1103,7 +1653,7 @@ __host__ __device__ bool fp_sqrt(Fp &c, const Fp &a) {
fp_sub(reduced_c, c, p);
fp_copy(c, reduced_c);
fp_to_montgomery(reduced_c_mont, reduced_c);
fp_mont_mul(c_squared_mont, reduced_c_mont, reduced_c_mont);
fp_mont_sqr(c_squared_mont, reduced_c_mont);
if (fp_cmp(c_squared_mont, a_mont) == ComparisonType::Equal) {
return true;
}

View File

@@ -74,6 +74,18 @@ __host__ __device__ void fp2_sub(Fp2 &c, const Fp2 &a, const Fp2 &b) {
fp_sub(c.c1, a.c1, b.c1);
}
// Lazy Fp2 addition: c = a + b, applied independently to each component via
// fp_add_lazy (fp2_sub_lazy is the subtraction counterpart).
// Each output component is left in [0, 2p) rather than fully reduced, which
// is the input range the Montgomery multiplication is documented to accept.
// NOTE(review): fp2_mont_mul lazily adds the components of its own operands,
// so feeding it values that are already lazy stacks two unreduced additions;
// confirm the [0, 2p) input bound of fp_mont_mul still holds on that path.
__host__ __device__ void fp2_add_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b) {
  // The two components are independent; order is irrelevant.
  fp_add_lazy(c.c1, a.c1, b.c1);
  fp_add_lazy(c.c0, a.c0, b.c0);
}
// Lazy Fp2 subtraction: c = a - b, applied independently to each component
// via fp_sub_lazy. Each output component is left in [0, 2p) rather than
// fully reduced.
__host__ __device__ void fp2_sub_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b) {
  // The two components are independent; order is irrelevant.
  fp_sub_lazy(c.c1, a.c1, b.c1);
  fp_sub_lazy(c.c0, a.c0, b.c0);
}
// Small-constant multiplication via addition chains.
// These replace full Fp2 Montgomery multiplications by 2, 3, 4, 8 with
// modular additions on each component.
@@ -158,8 +170,10 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b) {
fp_mont_mul(t0, a.c0, b.c0);
fp_mont_mul(t1, a.c1, b.c1);
fp_add(t2, a.c0, a.c1);
fp_add(t3, b.c0, b.c1);
// Lazy add: skip the conditional subtraction since t2, t3 feed fp_mont_mul
// which accepts inputs in [0, 2p). Saves 2 conditional subtractions.
fp_add_lazy(t2, a.c0, a.c1);
fp_add_lazy(t3, b.c0, b.c1);
fp_mont_mul(t2, t2, t3);
fp_sub(c.c0, t0, t1);
fp_sub(c.c1, t2, t0);
@@ -176,8 +190,10 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b) {
__host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a) {
Fp sum, diff, c0_tmp, prod;
fp_add(sum, a.c0, a.c1);
fp_sub(diff, a.c0, a.c1);
// Lazy add/sub: sum and diff feed fp_mont_mul (accepts [0, 2p)).
// Saves 2 conditional subtractions vs canonical fp_add + fp_sub.
fp_add_lazy(sum, a.c0, a.c1);
fp_sub_lazy(diff, a.c0, a.c1);
fp_mont_mul(c0_tmp, sum, diff);
fp_mont_mul(prod, a.c0, a.c1);
@@ -242,7 +258,7 @@ __host__ __device__ void fp_inv_fermat(Fp &result, const Fp &a) {
if (found_first_bit || ((p_minus_2.limb[limb] >> bit) & 1)) {
found_first_bit = true;
Fp temp;
fp_mont_mul(temp, result_mont, result_mont);
fp_mont_sqr(temp, result_mont);
fp_copy(result_mont, temp);
if ((p_minus_2.limb[limb] >> bit) & 1) {
@@ -267,8 +283,8 @@ __host__ __device__ void fp2_inv(Fp2 &c, const Fp2 &a) {
// Compute norm = a0^2 + a1^2 in Montgomery form
Fp t0, t1, norm_m;
fp_mont_mul(t0, a0_m, a0_m);
fp_mont_mul(t1, a1_m, a1_m);
fp_mont_sqr(t0, a0_m);
fp_mont_sqr(t1, a1_m);
fp_add(norm_m, t0, t1);
// Convert norm to normal form for inversion, then back to Montgomery
@@ -295,8 +311,8 @@ __host__ __device__ void fp2_inv(Fp2 &c, const Fp2 &a) {
__host__ __device__ void fp2_mont_inv(Fp2 &c, const Fp2 &a) {
Fp t0, t1, norm, norm_inv;
fp_mont_mul(t0, a.c0, a.c0);
fp_mont_mul(t1, a.c1, a.c1);
fp_mont_sqr(t0, a.c0);
fp_mont_sqr(t1, a.c1);
fp_add(norm, t0, t1);
fp_mont_inv(norm_inv, norm);
fp_mont_mul(c.c0, a.c0, norm_inv);

View File

@@ -0,0 +1,174 @@
#include "fp.h"
#include "fp2.h"
#include "xyzz.h"
// Turns `p` into the G1 point at infinity by zeroing ZZ and ZZZ.
// X and Y are deliberately left untouched; xyzz_is_infinity only inspects ZZ.
__host__ __device__ void xyzz_infinity(G1XYZZ &p) {
  fp_zero(p.ZZZ);
  fp_zero(p.ZZ);
}
// Reports whether `p` is the G1 point at infinity, encoded as ZZ == 0.
__host__ __device__ bool xyzz_is_infinity(const G1XYZZ &p) {
  const bool zz_vanishes = fp_is_zero(p.ZZ);
  return zz_vanishes;
}
// Lifts an affine G1 point into XYZZ coordinates: X and Y are copied as-is
// and both ZZ and ZZZ become the Montgomery representation of 1.
// The affine infinity flag is not consulted here; callers must handle an
// infinite input themselves.
__host__ __device__ void xyzz_from_affine(G1XYZZ &xyzz,
                                          const G1Affine &affine) {
  fp_one_montgomery(xyzz.ZZZ);
  fp_one_montgomery(xyzz.ZZ);
  xyzz.Y = affine.y;
  xyzz.X = affine.x;
}
// Mixed addition in G1: acc += p, with acc in XYZZ coordinates and p affine.
// Cases, in the order they are tested:
//   - p is infinity              -> acc unchanged
//   - acc is infinity            -> acc = p (lifted to XYZZ)
//   - same x, same y (acc == p)  -> doubling of p
//   - same x, different y        -> acc = infinity
//   - otherwise                  -> general mixed addition
// All field values are handled through the Montgomery multiply/square
// routines, so coordinates are expected in Montgomery form.
// NOTE(review): the doubling branch adds no curve-coefficient `a` term to M,
// which presumes a curve with a = 0 — confirm for this curve.
__host__ __device__ void xyzz_mixed_add(G1XYZZ &acc, const G1Affine &p) {
if (p.infinity)
return;
if (xyzz_is_infinity(acc)) {
xyzz_from_affine(acc, p);
return;
}
// S2 = y2*ZZZ1, U2 = x2*ZZ1
// (brings the affine point to the same denominators as acc)
Fp S2, U2;
fp_mont_mul(S2, p.y, acc.ZZZ);
fp_mont_mul(U2, p.x, acc.ZZ);
Fp P = U2 - acc.X; // P = U2 - X1
Fp R = S2 - acc.Y; // R = S2 - Y1
if (fp_is_zero(P)) {
// Equal x-coordinates: the points are either identical or opposite.
if (fp_is_zero(R)) {
// Identical points: double the affine point p directly.
// U = 2*y2
// ZZ3 = V = U^2
// ZZZ3 = W = V*U
// S = x2*V
// M = 3*x2^2
// X3 = M^2 - 2*S
// Y3 = M*(S-X3) - W*y2
Fp U, S, M;
fp_double(U, p.y); // U = 2*y2
fp_mont_sqr(acc.ZZ, U); // ZZ3 = V = U^2
fp_mont_mul(acc.ZZZ, acc.ZZ, U); // ZZZ3 = W = V*U
fp_mont_mul(S, p.x, acc.ZZ); // S = x2*V
fp_mont_sqr(M, p.x); // x2^2
fp_mul3(M, M); // M = 3*x2^2
fp_mont_sqr(acc.X, M); // M^2
acc.X = acc.X - S - S; // X3 = M^2 - 2*S
fp_mont_mul(acc.Y, acc.ZZZ, p.y); // W*y2
Fp tmp = S - acc.X; // S - X3
fp_mont_mul(tmp, tmp, M); // M*(S-X3)
acc.Y = tmp - acc.Y; // Y3 = M*(S-X3) - W*y2
} else {
// Opposite points: the sum is the point at infinity.
xyzz_infinity(acc);
}
return;
}
// General addition (P != 0): 8M + 2S
// (the count includes the two multiplications for S2 and U2 above)
Fp PP, PPP, Q;
fp_mont_sqr(PP, P); // PP = P^2
fp_mont_mul(PPP, P, PP); // PPP = P*PP
fp_mont_mul(Q, acc.X, PP); // Q = X1*PP
fp_mont_mul(acc.ZZ, acc.ZZ, PP); // ZZ3 = ZZ1*PP
fp_mont_mul(acc.ZZZ, acc.ZZZ, PPP); // ZZZ3 = ZZZ1*PPP
// X3 goes into a temporary because acc.X was already consumed for Q and
// acc.Y (still needed below) must be updated before acc.X is overwritten.
Fp X3;
fp_mont_sqr(X3, R); // R^2
X3 = X3 - PPP - Q - Q; // X3 = R^2 - PPP - 2*Q
Fp QmX3 = Q - X3;
fp_mont_mul(QmX3, QmX3, R); // R*(Q-X3)
fp_mont_mul(acc.Y, acc.Y, PPP); // Y1*PPP
acc.Y = QmX3 - acc.Y; // Y3 = R*(Q-X3) - Y1*PPP
acc.X = X3;
}
// Converts a G1 XYZZ point to projective coordinates with the common
// denominator Z = ZZ*ZZZ:
//   X_proj = X*ZZZ,  Y_proj = Y*ZZ,  Z_proj = ZZ*ZZZ
// A point at infinity (ZZ == 0) maps to Z_proj == 0.
__host__ __device__ void xyzz_to_projective(G1Projective &proj,
                                            const G1XYZZ &xyzz) {
  // The three products are independent of one another.
  fp_mont_mul(proj.Z, xyzz.ZZ, xyzz.ZZZ);
  fp_mont_mul(proj.Y, xyzz.Y, xyzz.ZZ);
  fp_mont_mul(proj.X, xyzz.X, xyzz.ZZZ);
}
// Turns `p` into the G2 point at infinity by zeroing ZZ and ZZZ.
// X and Y are deliberately left untouched; xyzz_is_infinity only inspects ZZ.
__host__ __device__ void xyzz_infinity(G2XYZZ &p) {
  fp2_zero(p.ZZZ);
  fp2_zero(p.ZZ);
}
// Reports whether `p` is the G2 point at infinity, encoded as ZZ == 0.
__host__ __device__ bool xyzz_is_infinity(const G2XYZZ &p) {
  const bool zz_vanishes = fp2_is_zero(p.ZZ);
  return zz_vanishes;
}
// Lifts an affine G2 point into XYZZ coordinates: X and Y are copied as-is
// and both ZZ and ZZZ become 1 in Fp2 Montgomery form, i.e. (1_mont, 0).
// The affine infinity flag is not consulted here; callers must handle an
// infinite input themselves.
__host__ __device__ void xyzz_from_affine(G2XYZZ &xyzz,
                                          const G2Affine &affine) {
  // Imaginary parts of the unit are zero ...
  fp_zero(xyzz.ZZZ.c1);
  fp_zero(xyzz.ZZ.c1);
  // ... and the real parts are the Montgomery one.
  fp_one_montgomery(xyzz.ZZZ.c0);
  fp_one_montgomery(xyzz.ZZ.c0);
  xyzz.Y = affine.y;
  xyzz.X = affine.x;
}
// Mixed addition in G2: acc += p, with acc in XYZZ coordinates over Fp2 and
// p affine. Same structure as the G1 overload, using Fp2 arithmetic.
// Cases, in the order they are tested:
//   - p is infinity              -> acc unchanged
//   - acc is infinity            -> acc = p (lifted to XYZZ)
//   - same x, same y (acc == p)  -> doubling of p
//   - same x, different y        -> acc = infinity
//   - otherwise                  -> general mixed addition
// NOTE(review): the doubling branch adds no curve-coefficient `a` term to M,
// which presumes a curve with a = 0 — confirm for this curve.
__host__ __device__ void xyzz_mixed_add(G2XYZZ &acc, const G2Affine &p) {
if (p.infinity)
return;
if (xyzz_is_infinity(acc)) {
xyzz_from_affine(acc, p);
return;
}
// Bring the affine point to the same denominators as acc.
Fp2 S2, U2;
fp2_mont_mul(S2, p.y, acc.ZZZ); // S2 = y2*ZZZ1
fp2_mont_mul(U2, p.x, acc.ZZ); // U2 = x2*ZZ1
Fp2 P = U2 - acc.X;
Fp2 R = S2 - acc.Y;
if (fp2_is_zero(P)) {
// Equal x-coordinates: the points are either identical or opposite.
if (fp2_is_zero(R)) {
// Identical points: double the affine point p directly.
Fp2 U, S, M;
fp2_double(U, p.y);
fp2_mont_square(acc.ZZ, U); // ZZ3 = V = U^2
fp2_mont_mul(acc.ZZZ, acc.ZZ, U); // ZZZ3 = W = V*U
fp2_mont_mul(S, p.x, acc.ZZ); // S = x2*V
fp2_mont_square(M, p.x); // x2^2
fp2_mul3(M, M); // M = 3*x2^2
fp2_mont_square(acc.X, M); // M^2
acc.X = acc.X - S - S; // X3 = M^2 - 2*S
fp2_mont_mul(acc.Y, acc.ZZZ, p.y); // W*y2
Fp2 tmp = S - acc.X;
fp2_mont_mul(tmp, tmp, M); // M*(S-X3)
acc.Y = tmp - acc.Y; // Y3 = M*(S-X3)-W*y2
} else {
// Opposite points: the sum is the point at infinity.
xyzz_infinity(acc);
}
return;
}
// General addition (8M_Fp2 + 2S_Fp2)
// (the count includes the two multiplications for S2 and U2 above)
Fp2 PP, PPP, Q;
fp2_mont_square(PP, P); // PP = P^2
fp2_mont_mul(PPP, P, PP); // PPP = P*PP
fp2_mont_mul(Q, acc.X, PP); // Q = X1*PP
fp2_mont_mul(acc.ZZ, acc.ZZ, PP); // ZZ3 = ZZ1*PP
fp2_mont_mul(acc.ZZZ, acc.ZZZ, PPP); // ZZZ3 = ZZZ1*PPP
// X3 goes into a temporary because acc.X was already consumed for Q and
// acc.Y (still needed below) must be updated before acc.X is overwritten.
Fp2 X3;
fp2_mont_square(X3, R); // R^2
X3 = X3 - PPP - Q - Q; // X3 = R^2 - PPP - 2*Q
Fp2 QmX3 = Q - X3;
fp2_mont_mul(QmX3, QmX3, R); // R*(Q-X3)
fp2_mont_mul(acc.Y, acc.Y, PPP); // Y1*PPP
acc.Y = QmX3 - acc.Y; // Y3 = R*(Q-X3) - Y1*PPP
acc.X = X3;
}
// Converts a G2 XYZZ point to projective coordinates with the common
// denominator Z = ZZ*ZZZ:
//   X_proj = X*ZZZ,  Y_proj = Y*ZZ,  Z_proj = ZZ*ZZZ
// A point at infinity (ZZ == 0) maps to Z_proj == 0.
__host__ __device__ void xyzz_to_projective(G2Projective &proj,
                                            const G2XYZZ &xyzz) {
  // The three products are independent of one another.
  fp2_mont_mul(proj.Z, xyzz.ZZ, xyzz.ZZZ);
  fp2_mont_mul(proj.Y, xyzz.Y, xyzz.ZZ);
  fp2_mont_mul(proj.X, xyzz.X, xyzz.ZZZ);
}

View File

@@ -11,7 +11,7 @@
#include <stddef.h>
#include <cstring>
#include "helper_profile.cuh"
#include "../../tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh"
// C++ helper functions (not exported, used internally)
// These can call template functions since they have C++ linkage

View File

@@ -16,14 +16,19 @@ tfhe = { path = "../../tfhe", features = ["hpu", "hpu-debug"] }
ipc-channel = "0.18.3"
strum = { version = "0.26.2", features = ["derive"] }
strum_macros = "0.26.2"
bytemuck = { workspace = true }
clap = { version = "4.4.4", features = ["derive"] }
clap-num = "*"
anyhow = "1.0.82"
tracing = "0.1.40"
tracing-subscriber = { version = "0.3.18", features = ["env-filter", "json"] }
serde_json = "1.0"
rand = "0.8.5"
serde = { version = "1", features = ["derive"] }
bitflags = "2.6.0"
[[bin]]
name = "hpu_mockup"

View File

@@ -1 +1 @@
nightly-2026-04-22
nightly-2026-01-14

View File

@@ -25,7 +25,7 @@ use tfhe::{
CompressedKVStore, CompressedPublicKey, CompressedServerKey,
CompressedSquashedNoiseCiphertextList, CompressedSquashedNoiseCiphertextListBuilder, FheBool,
FheInt8, FheUint32, FheUint64, FheUint8, ReRandomizationContext, ReRandomizationMode,
ReRandomizationSupport, Seed, ServerKey, SquashedNoiseFheBool, SquashedNoiseFheInt,
ReRandomizationSupport, ServerKey, SquashedNoiseFheBool, SquashedNoiseFheInt,
SquashedNoiseFheUint,
};
use tfhe_backward_compat_data::load::{
@@ -748,22 +748,6 @@ fn test_hl_key_features(
}
}
// OPRF: check that oblivious pseudo-random generation works with the dedicated key.
// The decrypted values only need to be within range; the seed is deterministic but we
// don't compare to specific bit values (those are validated in the unit tests).
if server_key.supports_oprf() {
let seed = Seed(42u128);
let rand_bool = FheBool::generate_oblivious_pseudo_random(seed);
let _: bool = rand_bool.decrypt(client_key);
let rand_uint = FheUint8::generate_oblivious_pseudo_random(seed);
let _: u8 = rand_uint.decrypt(client_key);
let rand_int = FheInt8::generate_oblivious_pseudo_random(seed);
let _: i8 = rand_int.decrypt(client_key);
}
Ok(())
}

View File

@@ -54,7 +54,6 @@ internal-keycache = ["tfhe/internal-keycache"]
avx512 = ["tfhe/avx512"]
pbs-stats = ["tfhe/pbs-stats"]
zk-pok = ["tfhe/zk-pok", "dep:tfhe-zk-pok"]
experimental = ["tfhe/experimental"]
[[bench]]
name = "boolean"
@@ -231,9 +230,3 @@ required-features = ["integer", "internal-keycache"]
name = "wasm_benchmarks_parser"
path = "src/bin/wasm_benchmarks_parser.rs"
required-features = ["shortint", "internal-keycache"]
[[bench]]
name = "cm-bench"
path = "benches/core_crypto/cm_bench.rs"
harness = false
required-features = ["experimental"]

View File

@@ -1,211 +0,0 @@
use cm_fft64::programmable_bootstrap_cm_lwe_ciphertext;
use criterion::{black_box, criterion_main, Criterion};
use tfhe::core_crypto::experimental::prelude::cm_lwe_keyswitch_key_generation::allocate_and_generate_new_cm_lwe_keyswitch_key;
use tfhe::core_crypto::experimental::prelude::cm_modulus_switch_noise_reduction::improve_lwe_ciphertext_modulus_switch_noise_for_binary_key_cm;
use tfhe::core_crypto::experimental::prelude::*;
use tfhe::core_crypto::prelude::*;
fn cm_bench(c: &mut Criterion) {
let bench_cm_params_2_minus_64: Vec<CmApParams> = vec![
CM_PARAM_2_2_MINUS_64,
CM_PARAM_4_2_MINUS_64,
CM_PARAM_6_2_MINUS_64,
CM_PARAM_8_2_MINUS_64,
CM_PARAM_10_2_MINUS_64,
CM_PARAM_2_4_MINUS_64,
CM_PARAM_4_4_MINUS_64,
CM_PARAM_6_4_MINUS_64,
CM_PARAM_8_4_MINUS_64,
CM_PARAM_10_4_MINUS_64,
CM_PARAM_2_6_MINUS_64,
CM_PARAM_4_6_MINUS_64,
CM_PARAM_6_6_MINUS_64,
CM_PARAM_8_6_MINUS_64,
CM_PARAM_10_6_MINUS_64,
CM_PARAM_2_8_MINUS_64,
CM_PARAM_4_8_MINUS_64,
CM_PARAM_6_8_MINUS_64,
CM_PARAM_8_8_MINUS_64,
CM_PARAM_10_8_MINUS_64,
];
cm_bench_for_pfail(c, &bench_cm_params_2_minus_64, "2^-64");
let bench_cm_params_2_minus_128: Vec<CmApParams> = vec![
CM_PARAM_2_2_MINUS_128,
CM_PARAM_4_2_MINUS_128,
CM_PARAM_6_2_MINUS_128,
CM_PARAM_8_2_MINUS_128,
CM_PARAM_10_2_MINUS_128,
CM_PARAM_2_4_MINUS_128,
CM_PARAM_4_4_MINUS_128,
CM_PARAM_6_4_MINUS_128,
CM_PARAM_8_4_MINUS_128,
CM_PARAM_10_4_MINUS_128,
CM_PARAM_2_6_MINUS_128,
CM_PARAM_4_6_MINUS_128,
CM_PARAM_6_6_MINUS_128,
CM_PARAM_8_6_MINUS_128,
CM_PARAM_10_6_MINUS_128,
CM_PARAM_2_8_MINUS_128,
CM_PARAM_4_8_MINUS_128,
CM_PARAM_6_8_MINUS_128,
CM_PARAM_8_8_MINUS_128,
CM_PARAM_10_8_MINUS_128,
];
cm_bench_for_pfail(c, &bench_cm_params_2_minus_128, "2^-128");
}
/// Benchmarks the common-mask pipeline (keyswitch, modulus-switch noise
/// improvement, programmable bootstrap) once per parameter set.
///
/// * `c` - criterion instance the benchmark group is registered on.
/// * `bench_cm_params` - parameter sets to benchmark, one bench function each.
/// * `p_fail` - label for the targeted failure probability; only used in the
///   benchmark name.
///
/// Key generation and input encryption happen outside the timed closure; only
/// the three pipeline steps are measured.
fn cm_bench_for_pfail(c: &mut Criterion, bench_cm_params: &[CmApParams], p_fail: &str) {
let mut bench_group = c.benchmark_group("Common Mask Benchmarks");
bench_group.sample_size(10);
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let mut secret_generator = SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
for cm_param in bench_cm_params {
let cm_dimension = cm_param.cm_dimension;
let ciphertext_modulus = cm_param.ciphertext_modulus;
// Benchmark id: precision (p), common-mask dimension (w) and p_fail label.
let bench_name = format!(
"KS-CM-PBS_p={}_w={}_pfail={p_fail}",
cm_param.precision, cm_dimension.0,
);
let lwe_noise_distribution = cm_param.lwe_noise_distribution;
// The encoding below hard-codes a 64-bit native modulus.
assert_eq!(
cm_param.ciphertext_modulus,
CiphertextModulus::<u64>::new_native()
);
// 2^63; presumably leaves the top bit as padding — TODO confirm.
let encoding_with_padding = 1 << 63;
let glwe_dimension = cm_param.glwe_dimension;
let polynomial_size = cm_param.polynomial_size;
let msg_modulus = 1u64 << cm_param.precision;
// Scaling factor between a message value and its encoding.
let delta = encoding_with_padding / msg_modulus;
// The bootstrap evaluates the identity function.
let f = |x| x;
let accumulator = cm_generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension,
cm_dimension,
msg_modulus.cast_into(),
cm_param.ciphertext_modulus,
delta,
f,
);
let CmBootstrapKeys {
small_lwe_sk,
big_lwe_sk,
bsk,
fbsk,
} = generate_cm_pbs_keys(cm_param, &mut encryption_generator, &mut secret_generator);
// Only fbsk is used below; release bsk right away.
drop(bsk);
let cm_lwe_keyswitch_key = allocate_and_generate_new_cm_lwe_keyswitch_key(
&big_lwe_sk,
&small_lwe_sk,
cm_dimension,
cm_param.base_log_ks,
cm_param.level_ks,
lwe_noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);
// Input ciphertext: an encryption of all-zero plaintexts under the big key.
let plaintexts = PlaintextList::from_container(vec![0_u64; cm_dimension.0]);
let ct_in = allocate_and_encrypt_new_cm_lwe_ciphertext(
&big_lwe_sk,
&plaintexts,
lwe_noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);
// Output buffers for the keyswitch and the bootstrap, allocated once and
// reused by every benchmark iteration.
let mut ct_after_ks = CmLweCiphertext::new(
0u64,
cm_lwe_keyswitch_key.output_lwe_dimension(),
cm_dimension,
ciphertext_modulus,
);
let mut ct_out = CmLweCiphertext::new(
0u64,
fbsk.output_lwe_dimension(),
cm_dimension,
ciphertext_modulus,
);
// Encryptions of zero under the small key, consumed by the modulus-switch
// noise-improvement step.
let max_nb_zeros_n = cm_param.max_nb_zeros_n.ceil() as usize;
let mut encryptions_of_zero = CmLweCiphertextList::new(
0,
cm_param.lwe_dimension,
cm_dimension,
CmLweCiphertextCount(max_nb_zeros_n),
ciphertext_modulus,
);
let plaintext_list = PlaintextList::new(0, PlaintextCount(cm_dimension.0));
let plaintext_lists: Vec<_> = (0..max_nb_zeros_n)
.map(|_| plaintext_list.clone())
.collect();
encrypt_cm_lwe_ciphertext_list(
&small_lwe_sk,
&mut encryptions_of_zero,
&plaintext_lists,
lwe_noise_distribution,
&mut encryption_generator,
);
let log_modulus = polynomial_size.to_blind_rotation_input_modulus_log();
{
bench_group.bench_function(&bench_name, |b| {
b.iter(|| {
// Timed section: keyswitch -> noise improvement -> bootstrap.
cm_keyswitch_lwe_ciphertext(&cm_lwe_keyswitch_key, &ct_in, &mut ct_after_ks);
improve_lwe_ciphertext_modulus_switch_noise_for_binary_key_cm(
&mut ct_after_ks,
&encryptions_of_zero,
cm_param.r_sigma_factor_n,
cm_param.ms_bound_n,
cm_param.ms_input_variance_n,
log_modulus,
);
programmable_bootstrap_cm_lwe_ciphertext(
&ct_after_ks,
&mut ct_out,
&accumulator.as_view(),
&fbsk,
);
// Keep the optimizer from discarding the result.
black_box(&mut ct_out);
})
});
}
}
bench_group.finish();
}
/// Entry point handed to `criterion_main!`: builds a `Criterion` configured
/// from the command-line arguments and runs the common-mask benchmarks on it.
pub fn cm_group() {
    let base = Criterion::default();
    let mut criterion: Criterion<_> = base.configure_from_args();
    cm_bench(&mut criterion);
}
criterion_main!(cm_group);

View File

@@ -8,7 +8,6 @@ use rayon::prelude::*;
#[cfg(any(feature = "gpu", feature = "hpu"))]
use std::cmp::max;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::oprf::{OprfPrivateKey, OprfServerKey};
use tfhe::integer::IntegerKeyKind;
use tfhe::keycache::NamedParam;
#[cfg(any(feature = "gpu", feature = "hpu"))]
@@ -36,42 +35,32 @@ pub fn unsigned_oprf(c: &mut Criterion) {
format!("{bench_name}_bounded::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id_oprf, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let oprf_pk = OprfPrivateKey::new(&cks);
let oprf_sk = OprfServerKey::new(&oprf_pk, &cks).unwrap();
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
b.iter(|| {
_ = black_box(
oprf_sk.par_generate_oblivious_pseudo_random_unsigned_integer(
Seed(0),
num_block as u64,
&sks,
),
);
_ = black_box(sk.par_generate_oblivious_pseudo_random_unsigned_integer(
Seed(0),
num_block as u64,
));
})
});
bench_group.bench_function(&bench_id_oprf_bounded, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let oprf_pk = OprfPrivateKey::new(&cks);
let oprf_sk = OprfServerKey::new(&oprf_pk, &cks).unwrap();
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
b.iter(|| {
_ = black_box(
oprf_sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&sks,
),
);
})
});
}
BenchmarkType::Throughput => {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let oprf_pk = OprfPrivateKey::new(&cks);
let oprf_sk = OprfServerKey::new(&oprf_pk, &cks).unwrap();
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
bench_id_oprf = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_id_oprf_bounded =
@@ -82,11 +71,10 @@ pub fn unsigned_oprf(c: &mut Criterion) {
{
// Execute the operation once to know its cost.
reset_pbs_count();
oprf_sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&sks,
);
let pbs_count = max(get_pbs_count(), 1);
throughput_num_threads(num_block, pbs_count)
@@ -97,13 +85,11 @@ pub fn unsigned_oprf(c: &mut Criterion) {
let setup = |_batch_size: usize| ();
let run = |_: &mut (), batch_size: usize| {
(0..batch_size).into_par_iter().for_each(|_| {
oprf_sk
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&sks,
);
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
);
});
};
find_optimal_batch(run, setup) as u64
@@ -114,10 +100,9 @@ pub fn unsigned_oprf(c: &mut Criterion) {
bench_group.bench_function(&bench_id_oprf, |b| {
b.iter(|| {
(0..elements).into_par_iter().for_each(|_| {
oprf_sk.par_generate_oblivious_pseudo_random_unsigned_integer(
sk.par_generate_oblivious_pseudo_random_unsigned_integer(
Seed(0),
num_block as u64,
&sks,
);
})
})
@@ -126,11 +111,10 @@ pub fn unsigned_oprf(c: &mut Criterion) {
bench_group.bench_function(&bench_id_oprf_bounded, |b| {
b.iter(|| {
(0..elements).into_par_iter().for_each(|_| {
oprf_sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&sks,
);
})
})
@@ -164,8 +148,6 @@ pub mod cuda {
use criterion::black_box;
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
use tfhe::integer::gpu::server_key::CudaServerKey;
use tfhe::integer::gpu::CudaOprfServerKey;
use tfhe::integer::oprf::{CompressedOprfServerKey, OprfPrivateKey};
use tfhe::GpuIndex;
use tfhe_csprng::seeders::Seed;
@@ -195,18 +177,12 @@ pub mod cuda {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &streams);
let oprf_pk = OprfPrivateKey::new(&cks);
let compressed_oprf_sk =
CompressedOprfServerKey::new(&oprf_pk, &cks).unwrap();
let cuda_oprf_sk =
CudaOprfServerKey::decompress_from_cpu(&compressed_oprf_sk, &streams);
b.iter(|| {
_ = black_box(
cuda_oprf_sk.par_generate_oblivious_pseudo_random_unsigned_integer(
gpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer(
Seed(0),
num_block as u64,
&gpu_sks,
&streams,
),
);
@@ -217,20 +193,14 @@ pub mod cuda {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &streams);
let oprf_pk = OprfPrivateKey::new(&cks);
let compressed_oprf_sk =
CompressedOprfServerKey::new(&oprf_pk, &cks).unwrap();
let cuda_oprf_sk =
CudaOprfServerKey::decompress_from_cpu(&compressed_oprf_sk, &streams);
b.iter(|| {
_ = black_box(
cuda_oprf_sk
gpu_sks
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&gpu_sks,
&streams,
),
);
@@ -240,25 +210,13 @@ pub mod cuda {
BenchmarkType::Throughput => {
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks_vec = cuda_local_keys(&cks);
let cpu_oprf_pk = OprfPrivateKey::new(&cks);
let cpu_oprf_sk = OprfServerKey::new(&cpu_oprf_pk, &cks).unwrap();
let compressed_oprf_sk =
CompressedOprfServerKey::new(&cpu_oprf_pk, &cks).unwrap();
// One CudaOprfServerKey per GPU, matching `gpu_sks_vec`.
let cuda_oprf_sks_vec: Vec<CudaOprfServerKey> = (0..get_number_of_gpus())
.map(|gpu_index| {
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
CudaOprfServerKey::decompress_from_cpu(&compressed_oprf_sk, &stream)
})
.collect();
// Execute the operation once to know its cost.
reset_pbs_count();
cpu_oprf_sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
cpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&cpu_sks,
);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
@@ -274,11 +232,10 @@ pub mod cuda {
(0..elements).into_par_iter().for_each(|i| {
let gpu_index: u32 = i as u32 % get_number_of_gpus();
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
cuda_oprf_sks_vec[gpu_index as usize]
gpu_sks_vec[gpu_index as usize]
.par_generate_oblivious_pseudo_random_unsigned_integer(
Seed(0),
num_block as u64,
&gpu_sks_vec[gpu_index as usize],
&stream,
);
})
@@ -290,12 +247,11 @@ pub mod cuda {
(0..elements).into_par_iter().for_each(|i| {
let gpu_index: u32 = i as u32 % get_number_of_gpus();
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
cuda_oprf_sks_vec[gpu_index as usize]
gpu_sks_vec[gpu_index as usize]
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&gpu_sks_vec[gpu_index as usize],
&stream,
);
})

View File

@@ -2,7 +2,6 @@ use benchmark::params_aliases::*;
use criterion::{black_box, criterion_group, Criterion};
use tfhe::keycache::NamedParam;
use tfhe::shortint::keycache::KEY_CACHE;
use tfhe::shortint::oprf::{OprfPrivateKey, OprfServerKey};
use tfhe_csprng::seeders::Seed;
fn oprf(c: &mut Criterion) {
@@ -13,15 +12,11 @@ fn oprf(c: &mut Criterion) {
let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS;
let keys = KEY_CACHE.get_from_param(param);
let cks = keys.client_key();
let sks = keys.server_key();
let oprf_pk = OprfPrivateKey::new(cks);
let oprf_sk = OprfServerKey::new(&oprf_pk, cks).unwrap();
bench_group.bench_function(format!("2-bits-oprf::{}", param.name()), |b| {
b.iter(|| {
_ = black_box(oprf_sk.generate_oblivious_pseudo_random(Seed(0), 2, sks));
_ = black_box(sks.generate_oblivious_pseudo_random(Seed(0), 2));
})
});
}

View File

@@ -168,7 +168,7 @@ mod generic_tests {
fn test_xof_seed_getters() {
let seed_bytes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
let bits = u128::from_le_bytes(seed_bytes);
let dsep = *b"tfheksps";
let dsep = [b't', b'f', b'h', b'e', b'k', b's', b'p', b's'];
let seed = XofSeed::new_u128(bits, dsep);
let s = u128::from_le_bytes(seed.seed().try_into().unwrap());

View File

@@ -342,28 +342,6 @@ impl<G: Curve> Proof<G> {
None => ComputeLoad::Verify,
}
}
pub fn to_le_bytes(&self) -> Vec<u8> {
let mut bytes = Vec::new();
let Self {
c_hat,
c_y,
pi,
compute_load_proof_fields,
} = self;
bytes.extend_from_slice(c_hat.to_le_bytes().as_ref());
bytes.extend_from_slice(c_y.to_le_bytes().as_ref());
bytes.extend_from_slice(pi.to_le_bytes().as_ref());
let (c_hat_t_bytes, c_h_bytes, pi_kzg_bytes) =
ComputeLoadProofFields::to_le_bytes(compute_load_proof_fields);
bytes.extend_from_slice(&c_hat_t_bytes);
bytes.extend_from_slice(&c_h_bytes);
bytes.extend_from_slice(&pi_kzg_bytes);
bytes
}
}
impl<G: Curve> ParameterSetConformant for Proof<G> {
@@ -426,26 +404,6 @@ pub(crate) struct ComputeLoadProofFields<G: Curve> {
pub(crate) pi_kzg: G::G1,
}
impl<G: Curve> ComputeLoadProofFields<G> {
#[allow(clippy::type_complexity)]
fn to_le_bytes(fields: &Option<Self>) -> (Box<[u8]>, Box<[u8]>, Box<[u8]>) {
if let Some(ComputeLoadProofFields {
c_hat_t,
c_h,
pi_kzg,
}) = fields.as_ref()
{
(
Box::from(G::G2::to_le_bytes(*c_hat_t).as_ref()),
Box::from(G::G1::to_le_bytes(*c_h).as_ref()),
Box::from(G::G1::to_le_bytes(*pi_kzg).as_ref()),
)
} else {
(Box::from([]), Box::from([]), Box::from([]))
}
}
}
type CompressedG2<G> = <<G as Curve>::G2 as Compressible>::Compressed;
type CompressedG1<G> = <<G as Curve>::G1 as Compressible>::Compressed;

View File

@@ -440,44 +440,6 @@ impl<G: Curve> Proof<G> {
pub fn hash_config(&self) -> PkeV2SupportedHashConfig {
self.hash_config
}
pub fn to_le_bytes(&self) -> Vec<u8> {
let mut bytes = Vec::new();
let Self {
C_hat_e,
C_e,
C_r_tilde,
C_R,
C_hat_bin,
C_y,
C_h1,
C_h2,
C_hat_t,
pi,
pi_kzg,
compute_load_proof_fields,
hash_config: _,
} = self;
bytes.extend_from_slice(C_hat_e.to_le_bytes().as_ref());
bytes.extend_from_slice(C_e.to_le_bytes().as_ref());
bytes.extend_from_slice(C_r_tilde.to_le_bytes().as_ref());
bytes.extend_from_slice(C_R.to_le_bytes().as_ref());
bytes.extend_from_slice(C_hat_bin.to_le_bytes().as_ref());
bytes.extend_from_slice(C_y.to_le_bytes().as_ref());
bytes.extend_from_slice(C_h1.to_le_bytes().as_ref());
bytes.extend_from_slice(C_h2.to_le_bytes().as_ref());
bytes.extend_from_slice(C_hat_t.to_le_bytes().as_ref());
bytes.extend_from_slice(pi.to_le_bytes().as_ref());
bytes.extend_from_slice(pi_kzg.to_le_bytes().as_ref());
let (C_hat_h3_bytes, C_hat_w_bytes) =
ComputeLoadProofFields::to_le_bytes(compute_load_proof_fields);
bytes.extend_from_slice(&C_hat_h3_bytes);
bytes.extend_from_slice(&C_hat_w_bytes);
bytes
}
}
/// These fields can be pre-computed on the prover side in the faster Verifier scheme. If that's the

View File

@@ -64,7 +64,7 @@ tfhe-fft = { version = "0.10.1", path = "../tfhe-fft", features = [
"serde",
"fft128",
] }
tfhe-ntt = { version = "0.7.1", path = "../tfhe-ntt" }
tfhe-ntt = { version = "0.7.0", path = "../tfhe-ntt" }
pulp = { workspace = true, features = ["default"] }
tfhe-cuda-backend = { version = "0.14.0", path = "../backends/tfhe-cuda-backend", optional = true }
aligned-vec = { workspace = true, features = ["default", "serde"] }
@@ -99,7 +99,7 @@ serde-wasm-bindgen = { workspace = true, optional = true }
getrandom = { workspace = true, optional = true }
bytemuck = { workspace = true }
tfhe-hpu-backend = { version = "0.5", path = "../backends/tfhe-hpu-backend", optional = true }
tfhe-hpu-backend = { version = "0.4", path = "../backends/tfhe-hpu-backend", optional = true }
[features]
default = ["avx512"]

View File

@@ -8,23 +8,23 @@
<rect x="0" y="40" width="300" height="520" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="520" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">Negation (-)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">77.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">71.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">9.08 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="60.0">8.4 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">Add / Sub (+,-)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">91.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">93.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">9.07 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="100.0">8.35 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">Mul (x)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">357 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">352 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">32.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="140.0">122 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="180.0">Equal / Not Equal (eq, ne)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="180.0">72.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="180.0">70.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="180.0">7.03 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="180.0">6.77 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="220.0">Comparisons (ge, gt, le, lt)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="220.0">89.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="220.0">87.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="220.0">10.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="220.0">6.81 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="260.0">Max / Min (max, min)</text>
@@ -32,31 +32,31 @@
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="260.0">15.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="260.0">11.7 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="300.0">Bitwise operations (&amp;, |, ^)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="300.0">19.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="300.0">19.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="300.0">1.99 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="300.0">2.95 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="340.0">Div / Rem (/, %)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="340.0">4.88 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="340.0">5.04 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="340.0">514 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="340.0">912 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="380.0">Left / Right Shifts (&lt;&lt;, &gt;&gt;)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="380.0">121 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="380.0">119 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="380.0">18.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="380.0">25.8 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="420.0">Left / Right Rotations (left_rotate, right_rotate)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="420.0">121 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="420.0">119 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="420.0">18.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="420.0">27.9 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="460.0">Leading / Trailing zeros/ones</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="460.0">222 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="460.0">223 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="460.0">20.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="460.0">14.7 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="500.0">Log2</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="500.0">246 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="500.0">244 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="500.0">21.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="500.0">14.8 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="540.0">Select</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="540.0">40.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="540.0">39.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="540.0">4.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="540.0">5.53 ms</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>

Before

Width:  |  Height:  |  Size: 10 KiB

After

Width:  |  Height:  |  Size: 10 KiB

View File

@@ -7,13 +7,13 @@
<rect x="0" y="40" width="300" height="120" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="120" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">whitepaper</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="405.0" y="60.0">253 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="615.0" y="60.0">25.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="405.0" y="60.0">276 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="615.0" y="60.0">23.0 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">no_cmux</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="405.0" y="100.0">256 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="615.0" y="100.0">25.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="405.0" y="100.0">238 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="615.0" y="100.0">24.0 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">overflow</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="405.0" y="140.0">238 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="405.0" y="140.0">225 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="615.0" y="140.0">21.3 ops/s</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>

Before

Width:  |  Height:  |  Size: 2.8 KiB

After

Width:  |  Height:  |  Size: 2.8 KiB

View File

@@ -15,83 +15,83 @@
<rect x="0" y="40" width="300" height="520" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="520" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">Negation (-)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="60.0">50.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="60.0">55.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">57.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="60.0">77.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="60.0">96.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="60.0">52.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="60.0">55.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">54.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="60.0">76.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="60.0">96.0 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">Add / Sub (+,-)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="100.0">50.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="100.0">55.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">74.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="100.0">91.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="100.0">150 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="100.0">50.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="100.0">55.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">75.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="100.0">96.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="100.0">145 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">Mul (x)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="140.0">89.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="140.0">89.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="140.0">131 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">195 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="140.0">357 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="140.0">1.02 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="140.0">363 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="140.0">1.01 s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="180.0">Equal / Not Equal (eq, ne)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="180.0">33.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="180.0">52.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="180.0">52.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="180.0">72.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="180.0">72.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="180.0">33.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="180.0">50.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="180.0">51.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="180.0">71.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="180.0">72.0 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="220.0">Comparisons (ge, gt, le, lt)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="220.0">32.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="220.0">52.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="220.0">70.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="220.0">89.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="220.0">34.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="220.0">50.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="220.0">70.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="220.0">88.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="220.0">128 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="260.0">Max / Min (max, min)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="260.0">69.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="260.0">88.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="260.0">70.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="260.0">88.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="260.0">109 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="260.0">128 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="260.0">173 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="260.0">131 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="260.0">168 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="300.0">Bitwise operations (&amp;, |, ^)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="300.0">17.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="300.0">18.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="300.0">19.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="300.0">19.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="300.0">19.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="300.0">17.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="300.0">18.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="300.0">18.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="300.0">18.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="300.0">20.2 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="340.0">Div / Rem (/, %)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="340.0">460 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="340.0">1.01 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="340.0">2.22 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="340.0">4.88 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="340.0">12.6 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="340.0">457 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="340.0">1.0 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="340.0">2.2 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="340.0">4.99 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="340.0">12.5 s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="380.0">Left / Right Shifts (&lt;&lt;, &gt;&gt;)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="380.0">53.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="380.0">74.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="380.0">97.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="380.0">121 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="380.0">158 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="380.0">54.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="380.0">75.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="380.0">97.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="380.0">122 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="380.0">150 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="420.0">Left / Right Rotations (left_rotate, right_rotate)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="420.0">54.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="420.0">75.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="420.0">94.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="420.0">121 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="420.0">165 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="420.0">53.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="420.0">75.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="420.0">96.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="420.0">116 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="420.0">164 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="460.0">Leading / Trailing zeros/ones</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="460.0">67.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="460.0">70.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="460.0">89.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="460.0">92.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="460.0">113 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="460.0">86.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="460.0">140 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="460.0">164 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="460.0">220 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="460.0">264 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="500.0">Log2</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="500.0">110 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="500.0">163 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="500.0">186 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="500.0">246 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="500.0">290 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="500.0">103 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="500.0">159 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="500.0">183 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="500.0">236 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="500.0">279 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="540.0">Select</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="540.0">36.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="540.0">36.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="540.0">38.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="540.0">40.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="540.0">43.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="540.0">35.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="540.0">37.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="540.0">36.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="540.0">39.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="540.0">42.0 ms</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -15,65 +15,65 @@
<rect x="0" y="40" width="300" height="400" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="400" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">Add / Sub (+,-)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="60.0">53.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="60.0">55.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">57.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="60.0">78.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="60.0">99.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="60.0">50.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="60.0">54.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">54.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="60.0">76.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="60.0">95.2 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">Mul (x)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="100.0">71.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="100.0">70.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="100.0">115 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">155 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="100.0">207 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="100.0">422 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">156 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="100.0">208 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="100.0">412 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">Equal / Not Equal (eq, ne)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="140.0">34.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="140.0">33.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">52.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="140.0">52.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="140.0">72.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="140.0">33.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="140.0">33.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">52.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="140.0">53.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="140.0">71.2 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="180.0">Comparisons (ge, gt, le, lt)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="180.0">38.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="180.0">34.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="180.0">54.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="180.0">31.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="180.0">34.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="180.0">51.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="180.0">70.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="180.0">90.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="180.0">90.1 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="220.0">Max / Min (max, min)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="220.0">54.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="220.0">53.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="220.0">71.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="220.0">91.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="220.0">110 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="220.0">52.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="220.0">52.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="220.0">71.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="220.0">91.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="220.0">108 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="260.0">Bitwise operations (&amp;, |, ^)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="260.0">17.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="260.0">17.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="260.0">18.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="260.0">19.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="260.0">19.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="260.0">19.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="260.0">19.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="260.0">20.7 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="300.0">Div (/)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="300.0">136 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="300.0">172 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="300.0">245 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="300.0">437 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="300.0">792 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="300.0">126 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="300.0">182 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="300.0">234 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="300.0">427 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="300.0">799 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="340.0">Rem (%)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="340.0">235 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="340.0">337 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="340.0">468 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="340.0">690 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="340.0">1.27 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="340.0">244 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="340.0">334 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="340.0">462 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="340.0">657 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="340.0">1.19 s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="380.0">Left / Right Shifts (&lt;&lt;, &gt;&gt;)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="380.0">17.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="380.0">18.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="380.0">19.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="380.0">19.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="380.0">21.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="380.0">17.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="380.0">18.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="380.0">19.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="380.0">19.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="380.0">19.8 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="420.0">Left / Right Rotations (left_rotate, right_rotate)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="420.0">18.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="420.0">18.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="420.0">19.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="420.0">19.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="420.0">20.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="420.0">17.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="420.0">18.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="420.0">18.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="420.0">20.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="420.0">21.0 ms</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 13 KiB

After

Width:  |  Height:  |  Size: 13 KiB

View File

@@ -15,83 +15,83 @@
<rect x="0" y="40" width="300" height="520" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="520" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">Negation (-)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="60.0">804 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="60.0">372 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">181 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="60.0">86.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="60.0">42.1 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="60.0">824 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="60.0">388 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">184 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="60.0">88.7 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="60.0">42.8 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">Add / Sub (+,-)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="100.0">733 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="100.0">356 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">167 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="100.0">82.6 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="100.0">40.0 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="100.0">752 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="100.0">368 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">172 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="100.0">82.1 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="100.0">39.5 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">Mul (x)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="140.0">293 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="140.0">71.9 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">18.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="140.0">4.58 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="140.0">1.19 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="140.0">283 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="140.0">65.7 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">17.7 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="140.0">4.68 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="140.0">1.17 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="180.0">Equal / Not Equal (eq, ne)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="180.0">1.6 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="180.0">740 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="180.0">392 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="180.0">200 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="180.0">101 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="180.0">1.65 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="180.0">748 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="180.0">391 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="180.0">195 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="180.0">102 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="220.0">Comparisons (ge, gt, le, lt)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="220.0">1.58 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="220.0">733 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="220.0">354 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="220.0">171 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="220.0">64.7 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="220.0">1.62 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="220.0">745 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="220.0">355 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="220.0">170 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="220.0">65.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="260.0">Max / Min (max, min)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="260.0">493 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="260.0">236 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="260.0">116 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="260.0">58.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="260.0">25.7 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="260.0">488 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="260.0">239 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="260.0">117 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="260.0">57.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="260.0">25.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="300.0">Bitwise operations (&amp;, |, ^)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="300.0">2.1 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="300.0">981 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="300.0">490 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="300.0">262 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="300.0">130 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="300.0">2.14 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="300.0">1.06 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="300.0">537 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="300.0">270 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="300.0">136 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="340.0">Div / Rem (/, %)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="340.0">45.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="340.0">12.9 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="340.0">3.56 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="340.0">0.893 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="340.0">0.223 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="340.0">42.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="340.0">12.7 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="340.0">3.51 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="340.0">0.914 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="340.0">0.143 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="380.0">Left / Right Shifts (&lt;&lt;, &gt;&gt;)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="380.0">464 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="380.0">183 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="380.0">76.1 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="380.0">32.4 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="380.0">14.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="380.0">469 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="380.0">182 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="380.0">74.5 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="380.0">32.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="380.0">14.1 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="420.0">Left / Right Rotations (left_rotate, right_rotate)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="420.0">391 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="420.0">397 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="420.0">170 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="420.0">74.0 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="420.0">32.5 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="420.0">72.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="420.0">32.1 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="420.0">14.0 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="460.0">Leading / Trailing zeros/ones</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="460.0">824 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="460.0">487 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="460.0">222 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="460.0">119 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="460.0">57.8 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="460.0">621 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="460.0">235 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="460.0">104 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="460.0">41.8 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="460.0">17.8 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="500.0">Log2</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="500.0">542 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="500.0">220 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="500.0">102 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="500.0">42.0 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="500.0">18.6 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="500.0">536 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="500.0">207 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="500.0">96.4 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="500.0">40.4 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="500.0">17.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="540.0">Select</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="540.0">676 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="540.0">350 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="540.0">176 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="540.0">84.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="540.0">42.6 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="540.0">699 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="540.0">351 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="540.0">175 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="540.0">87.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="540.0">43.4 ops/s</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -15,65 +15,65 @@
<rect x="0" y="40" width="300" height="400" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="400" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">Add / Sub (+,-)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="60.0">810 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="60.0">379 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">178 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="60.0">86.0 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="60.0">41.7 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="60.0">836 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="60.0">383 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">184 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="60.0">87.9 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="60.0">42.5 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">Mul (x)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="100.0">658 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="100.0">185 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">57.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="100.0">17.6 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="100.0">4.83 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="100.0">659 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="100.0">182 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">52.8 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="100.0">16.5 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="100.0">4.79 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">Equal / Not Equal (eq, ne)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="140.0">2.69 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="140.0">1.57 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">723 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="140.0">378 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="140.0">192 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="140.0">2.73 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="140.0">1.68 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">757 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="140.0">399 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="140.0">198 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="180.0">Comparisons (ge, gt, le, lt)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="180.0">2.61 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="180.0">1.63 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="180.0">717 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="180.0">348 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="180.0">172 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="180.0">2.82 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="180.0">1.64 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="180.0">747 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="180.0">356 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="180.0">173 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="220.0">Max / Min (max, min)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="220.0">1.15 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="220.0">621 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="220.0">302 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="220.0">148 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="220.0">73.6 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="220.0">1.18 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="220.0">645 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="220.0">305 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="220.0">150 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="220.0">73.2 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="260.0">Bitwise operations (&amp;, |, ^)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="260.0">2.11 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="260.0">1.04 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="260.0">516 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="260.0">260 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="260.0">128 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="260.0">2.31 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="260.0">1.12 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="260.0">555 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="260.0">276 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="260.0">139 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="300.0">Div (/)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="300.0">203 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="300.0">73.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="300.0">24.8 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="300.0">7.38 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="300.0">2.16 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="300.0">196 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="300.0">69.6 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="300.0">23.7 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="300.0">7.63 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="300.0">2.13 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="340.0">Rem (%)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="340.0">130 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="340.0">49.1 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="340.0">17.1 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="340.0">5.65 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="340.0">1.75 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="340.0">114 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="340.0">44.5 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="340.0">16.6 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="340.0">5.78 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="340.0">1.66 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="380.0">Left / Right Shifts (&lt;&lt;, &gt;&gt;)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="380.0">2.01 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="380.0">1.02 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="380.0">510 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="380.0">247 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="380.0">124 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="380.0">2.13 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="380.0">1.07 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="380.0">546 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="380.0">270 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="380.0">138 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="420.0">Left / Right Rotations (left_rotate, right_rotate)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="420.0">2.01 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="420.0">992 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="420.0">517 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="420.0">254 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="420.0">124 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="342.0" y="420.0">2.14 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="426.0" y="420.0">1.07 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="420.0">541 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="594.0" y="420.0">270 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="678.0" y="420.0">137 ops/s</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 13 KiB

After

Width:  |  Height:  |  Size: 13 KiB

View File

@@ -9,25 +9,25 @@
<rect x="0" y="40" width="300" height="160" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="160" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">PBS</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="60.0">9.57 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="60.0">12.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="60.0">112 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="60.0">1.58 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="60.0">9.54 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="60.0">12.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="60.0">111 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="60.0">1.39 s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">MB-PBS</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="100.0">4.42 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="100.0">4.71 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="100.0">30.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="100.0">257 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="100.0">4.02 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="100.0">4.55 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="100.0">30.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="100.0">244 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">KS - PBS</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="140.0">11.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="140.0">15.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="140.0">126 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="140.0">1.58 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="140.0">10.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="140.0">15.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="140.0">125 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="140.0">1.51 s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="180.0">KS - MB-PBS</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="180.0">6.67 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="180.0">8.49 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="180.0">46.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="180.0">388 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="180.0">5.56 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="180.0">7.29 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="180.0">61.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="180.0">418 ms</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 5.0 KiB

After

Width:  |  Height:  |  Size: 5.0 KiB

View File

@@ -9,25 +9,25 @@
<rect x="0" y="40" width="300" height="160" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="160" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">PBS</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="60.0">8.93 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="60.0">8.94 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="60.0">11.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="60.0">102 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="60.0">654 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="60.0">104 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="60.0">670 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">MB-PBS</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="100.0">4.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="100.0">4.58 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="100.0">28.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="100.0">214 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="100.0">4.87 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="100.0">4.53 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="100.0">30.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="100.0">185 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">KS - PBS</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="140.0">10.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="140.0">14.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="140.0">119 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="140.0">865 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="140.0">10.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="140.0">15.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="140.0">120 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="140.0">871 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="180.0">KS - MB-PBS</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="180.0">6.96 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="180.0">7.59 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="180.0">47.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="180.0">247 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="352.5" y="180.0">6.83 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="457.5" y="180.0">7.13 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="562.5" y="180.0">44.7 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="667.5" y="180.0">228 ms</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 5.0 KiB

After

Width:  |  Height:  |  Size: 5.0 KiB

View File

@@ -6,11 +6,11 @@
<rect x="0" y="40" width="300" height="120" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="120" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">1xFheUint64 (64 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">1.53 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">1.66 s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">4xFheUint64 (256 bits) </text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">1.55 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">1.66 s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">32xFheUint64 (2048 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">1.76 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">1.8 s</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 2.1 KiB

After

Width:  |  Height:  |  Size: 2.1 KiB

View File

@@ -8,17 +8,17 @@
<rect x="0" y="40" width="300" height="120" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="120" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">1xFheUint64 (64 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">209 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">43.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="60.0">67.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">276 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">44.0 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="60.0">66.0 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">4xFheUint64 (256 bits) </text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">211 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">277 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">44.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="100.0">72.9 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="100.0">70.3 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">32xFheUint64 (2048 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">219 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">293 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">49.1 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="140.0">185 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="140.0">184 ms</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 3.5 KiB

After

Width:  |  Height:  |  Size: 3.5 KiB

View File

@@ -8,17 +8,17 @@
<rect x="0" y="40" width="300" height="120" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="120" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">1xFheUint64 (64 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">8.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">265 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="60.0">129 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">7.9 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">274 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="60.0">131 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">4xFheUint64 (256 bits) </text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">8.36 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">259 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="100.0">50.8 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">7.9 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">277 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="100.0">51.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">32xFheUint64 (2048 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">8.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">236 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="140.0">8.38 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">7.73 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">242 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="140.0">8.62 ops/s</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 3.5 KiB

After

Width:  |  Height:  |  Size: 3.5 KiB

View File

@@ -6,11 +6,11 @@
<rect x="0" y="40" width="300" height="120" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="120" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">1xFheUint64 (64 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">1.71 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">1.94 s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">4xFheUint64 (256 bits) </text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">1.72 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">1.96 s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">32xFheUint64 (2048 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">1.93 s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">2.13 s</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 2.1 KiB

After

Width:  |  Height:  |  Size: 2.1 KiB

View File

@@ -8,17 +8,17 @@
<rect x="0" y="40" width="300" height="120" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="120" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">1xFheUint64 (64 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">214 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">31.2 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="60.0">52.5 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">292 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">31.4 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="60.0">51.8 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">4xFheUint64 (256 bits) </text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">217 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">31.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="100.0">57.3 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">294 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">31.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="100.0">56.2 ms</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">32xFheUint64 (2048 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">225 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">33.6 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="140.0">170 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">317 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">33.8 ms</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="140.0">167 ms</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 3.5 KiB

After

Width:  |  Height:  |  Size: 3.5 KiB

View File

@@ -8,17 +8,17 @@
<rect x="0" y="40" width="300" height="120" fill="#fbbc04"/>
<rect x="300" y="40" width="420" height="120" fill="#f3f3f3"/>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="60.0">1xFheUint64 (64 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">7.78 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">877 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="60.0">200 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="60.0">7.3 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="60.0">988 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="60.0">201 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="100.0">4xFheUint64 (256 bits) </text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">7.79 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">931 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="100.0">58.9 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="100.0">7.23 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="100.0">987 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="100.0">59.5 ops/s</text>
<text dominant-baseline="middle" text-anchor="start" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="6" y="140.0">32xFheUint64 (2048 bits)</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">7.77 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">993 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="140.0">8.59 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="370.0" y="140.0">7.1 ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="510.0" y="140.0">1.11 k.ops/s</text>
<text dominant-baseline="middle" text-anchor="middle" font-family="Arial" font-size="14" font-weight="normal" fill="black" x="650.0" y="140.0">8.85 ops/s</text>
<line stroke="white" stroke-width="2" x1="0" y1="0" x2="720" y2="0"/>
<line stroke="white" stroke-width="2" x1="0" y1="40" x2="720" y2="40"/>
<line stroke="white" stroke-width="2" x1="0" y1="80" x2="720" y2="80"/>

Before

Width:  |  Height:  |  Size: 3.5 KiB

After

Width:  |  Height:  |  Size: 3.5 KiB

Some files were not shown because too many files have changed in this diff Show More