Compare commits

..

5 Commits

Author SHA1 Message Date
ImmanuelSegol
b5364c24dd refactor 2024-02-28 11:41:09 -04:00
ImmanuelSegol
c2b73aee8d refactor 2024-02-28 11:37:25 -04:00
ImmanuelSegol
49663d89d3 refactor 2024-02-28 11:29:29 -04:00
ImmanuelSegol
dd509f095b refactor 2024-02-28 11:27:33 -04:00
ImmanuelSegol
9449ffd7cb refactor 2024-02-28 11:19:59 -04:00
214 changed files with 1651 additions and 7196 deletions

View File

@@ -1,10 +1,10 @@
golang:
- wrappers/golang/**/*.go
- wrappers/golang/**/*.h
- wrappers/golang/**/*.tmpl
- wrappers/golang/**/*.go'
- wrappers/golang/**/*.h'
- wrappers/golang/**/*.tmpl'
- go.mod
rust:
- wrappers/rust/**/*
- wrappers/rust
cpp:
- icicle/**/*.cu
- icicle/**/*.cuh

View File

@@ -1,39 +0,0 @@
name: Check Changed Files
on:
workflow_call:
outputs:
golang:
description: "Flag for if GoLang files changed"
value: ${{ jobs.check-changed-files.outputs.golang }}
rust:
description: "Flag for if Rust files changed"
value: ${{ jobs.check-changed-files.outputs.rust }}
cpp_cuda:
description: "Flag for if C++/CUDA files changed"
value: ${{ jobs.check-changed-files.outputs.cpp_cuda }}
jobs:
check-changed-files:
name: Check Changed Files
runs-on: ubuntu-22.04
outputs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Get all changed files
id: changed-files-yaml
uses: tj-actions/changed-files@v39
# https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
with:
files_yaml_from_source_file: .github/changed-files.yml
- name: Run Changed Files script
id: changed_files
# https://github.com/tj-actions/changed-files#outputs-
run: |
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"

View File

@@ -11,7 +11,7 @@ jobs:
name: Check Spelling
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- uses: codespell-project/actions-codespell@v2
with:
# https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip

View File

@@ -1,52 +0,0 @@
name: C++/CUDA
on:
pull_request:
branches:
- main
- dev
push:
branches:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
check-format:
name: Check Code Format
runs-on: ubuntu-22.04
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Check clang-format
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
test-linux:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Build
working-directory: ./icicle
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build
cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -DG2_DEFINED=ON -S . -B build
cmake --build build
- name: Run C++ Tests
working-directory: ./icicle/build
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest

View File

@@ -5,7 +5,7 @@ on:
branches:
- main
paths:
- 'docs/**'
- 'docs/*'
permissions:
contents: write

View File

@@ -21,19 +21,14 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
run-examples:
jobs:
test-examples:
runs-on: [self-hosted, Linux, X64, icicle, examples]
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v2
- name: c++ examples
working-directory: ./examples/c++
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
# loop over all directories in the current directory
for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
@@ -47,7 +42,6 @@ jobs:
done
- name: Rust examples
working-directory: ./examples/rust
if: needs.check-changed-files.outputs.rust == 'true'
run: |
# loop over all directories in the current directory
for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do

View File

@@ -1,119 +0,0 @@
name: GoLang
on:
pull_request:
branches:
- main
- dev
push:
branches:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
check-format:
name: Check Code Format
runs-on: ubuntu-22.04
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Check gofmt
if: needs.check-changed-files.outputs.golang == 'true'
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
build-linux:
name: Build on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh ${{ matrix.curve }} ON ON # builds a single curve with G2 and ECNTT enabled
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.curve }}-${{ github.workflow }}-${{ github.sha }}
path: icicle/build/libingo_${{ matrix.curve }}.a
retention-days: 1
test-linux:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, build-linux]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Download ICICLE lib artifacts
uses: actions/download-artifact@v4
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
path: ./icicle/build/
merge-multiple: true
- name: Run Tests
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# -count ensures the test results are not cached
# -p controls the number of programs that can be run in parallel
run: |
export CPATH=$CPATH:/usr/local/cuda/include
go test --tags=g2 ./... -count=1 -failfast -p 2 -timeout 60m
# TODO: bw6 on windows requires more memory than the standard runner has
# Add a large runner and then enable this job
# build-windows:
# name: Build on Windows
# runs-on: windows-2022
# needs: [check-changed-files, check-format]
# strategy:
# matrix:
# curve: [bn254, bls12_381, bls12_377, bw6_761]
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v4
# - name: Setup go
# uses: actions/setup-go@v5
# with:
# go-version: '1.20.0'
# - name: Download and Install Cuda
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# id: cuda-toolkit
# uses: Jimver/cuda-toolkit@v0.2.11
# with:
# cuda: '12.0.0'
# method: 'network'
# # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
# sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
# - name: Build libs
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# working-directory: ./wrappers/golang
# env:
# CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# shell: pwsh
# run: ./build.ps1 ${{ matrix.curve }} ON # builds a single curve with G2 enabled

119
.github/workflows/main-build.yml vendored Normal file
View File

@@ -0,0 +1,119 @@
name: Build
on:
pull_request:
branches:
- main
- dev
push:
branches:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
ARCH_TYPE: native
jobs:
check-changed-files:
name: Check Changed Files
runs-on: ubuntu-22.04
outputs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Get all changed files
id: changed-files-yaml
uses: tj-actions/changed-files@v39
# https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
with:
files_yaml_from_source_file: .github/changed-files.yml
- name: Run Changed Files script
id: changed_files
# https://github.com/tj-actions/changed-files#outputs-
run: |
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
build-rust-linux:
name: Build Rust on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build Rust
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose
build-rust-windows:
name: Build Rust on Windows
runs-on: windows-2022
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Download and Install Cuda
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: '12.0.0'
method: 'network'
# https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
- name: Build Rust Targets
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
env:
CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose
build-golang-linux:
name: Build Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
working-directory: ./wrappers/golang
run: |
export CPATH=$CPATH:/usr/local/cuda/include
./build.sh ${{ matrix.curve }} ON
# TODO: Add once Golang make file supports building for Windows
# build-golang-windows:
# name: Build Golang on Windows
# runs-on: windows-2022
# needs: check-changed-files
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v3
# - name: Download and Install Cuda
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# uses: Jimver/cuda-toolkit@v0.2.11
# with:
# cuda: '12.0.0'
# method: 'network'
# # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
# sub-packages: '["cudart", "nvcc", "thrust"]'
# - name: Build cpp libs
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# run: make all
# working-directory: ./goicicle

47
.github/workflows/main-format.yml vendored Normal file
View File

@@ -0,0 +1,47 @@
name: Format
on:
pull_request:
branches:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
formatting-rust:
name: Check Rust Code Formatting
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Check rustfmt
working-directory: ./wrappers/rust
# "-name target -prune" removes searching in any directory named "target"
# Formatting by single file is necessary due to generated files not being present
# before building the project.
# e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
# causing rustfmt to fail.
run: if [[ $(find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
# - name: Check clippy
# run: cargo clippy --no-deps --all-features --all-targets
formatting-golang:
name: Check Golang Code Formatting
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Check gofmt
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
formatting-cpp-cuda:
name: Check C++/CUDA Code Formatting
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Check clang-format
run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi

99
.github/workflows/main-test.yml vendored Normal file
View File

@@ -0,0 +1,99 @@
name: Test
on:
pull_request:
branches:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
ARCH_TYPE: native
jobs:
check-changed-files:
name: Check Changed Files
runs-on: ubuntu-22.04
outputs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Get all changed files
id: changed-files-yaml
uses: tj-actions/changed-files@v39
# https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
with:
files_yaml_from_source_file: .github/changed-files.yml
- name: Run Changed Files script
id: changed_files
# https://github.com/tj-actions/changed-files#outputs-
run: |
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
test-rust-linux:
name: Test Rust on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Run Rust Tests
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# Running tests from the root workspace will run all workspace members' tests by default
# We need to limit the number of threads to avoid running out of memory on weaker machines
run: cargo test --release --verbose --features=g2 -- --test-threads=2
test-cpp-linux:
name: Test C++ on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build C++
working-directory: ./icicle
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build
cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -S . -B build
cmake --build build
- name: Run C++ Tests
working-directory: ./icicle/build
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest
test-golang-linux:
name: Test Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
# strategy:
# matrix:
# curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# builds all curves with g2 ON
run: |
export CPATH=$CPATH:/usr/local/cuda/include
./build.sh all ON
- name: Run Golang Tests
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
export CPATH=$CPATH:/usr/local/cuda/include
go test --tags=g2 ./... -count=1 -timeout 60m

View File

@@ -1,50 +0,0 @@
name: Release
on:
workflow_dispatch:
inputs:
releaseType:
description: 'Release type'
required: true
default: 'minor'
type: choice
options:
- patch
- minor
- major
jobs:
release:
name: Release
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
ssh-key: ${{ secrets.DEPLOY_KEY }}
- name: Setup Cache
id: cache
uses: actions/cache@v4
with:
path: |
~/.cargo/bin/
~/.cargo/registry/index/
~/.cargo/registry/cache/
~/.cargo/git/db/
key: ${{ runner.os }}-cargo-${{ hashFiles('~/.cargo/bin/cargo-workspaces') }}
- name: Install cargo-workspaces
if: steps.cache.outputs.cache-hit != 'true'
run: cargo install cargo-workspaces
- name: Bump rust crate versions, commit, and tag
working-directory: wrappers/rust
# https://github.com/pksunkara/cargo-workspaces?tab=readme-ov-file#version
run: |
git config user.name release-bot
git config user.email release-bot@ingonyama.com
cargo workspaces version ${{ inputs.releaseType }} -y --no-individual-tags -m "Bump rust crates' version"
- name: Create draft release
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
LATEST_TAG=$(git describe --tags --abbrev=0)
gh release create $LATEST_TAG --generate-notes -d --verify-tag -t "Release $LATEST_TAG"

View File

@@ -1,87 +0,0 @@
name: Rust
on:
pull_request:
branches:
- main
- dev
push:
branches:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
check-format:
name: Check Code Format
runs-on: ubuntu-22.04
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Check rustfmt
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
working-directory: ./wrappers/rust
# "-name target -prune" removes searching in any directory named "target"
# Formatting by single file is necessary due to generated files not being present
# before building the project.
# e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
# causing rustfmt to fail.
run: if [[ $(find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
build-linux:
name: Build on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Build
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose
test-linux:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, build-linux]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Run tests
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# Running tests from the root workspace will run all workspace members' tests by default
# We need to limit the number of threads to avoid running out of memory on weaker machines
run: cargo test --release --verbose --features=g2 -- --test-threads=2
build-windows:
name: Build on Windows
runs-on: windows-2022
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Download and Install Cuda
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: '12.0.0'
method: 'network'
# https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
- name: Build targets
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
env:
CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose

View File

@@ -9,7 +9,7 @@ on:
jobs:
test-deploy:
name: Test deployment of docs website
name: Test deployment of docs webiste
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

View File

@@ -15,7 +15,7 @@ ENV PATH="/root/.cargo/bin:${PATH}"
# Install Golang
ENV GOLANG_VERSION 1.21.1
RUN curl -L https://go.dev/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
RUN curl -L https://golang.org/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
ENV PATH="/usr/local/go/bin:${PATH}"
# Set the working directory in the container

View File

@@ -1,6 +1,6 @@
# ICICLE
<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>
**<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>**
<p align="center">
<img alt="ICICLE" width="300" height="300" src="https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png"/>
@@ -11,12 +11,10 @@
</a>
<a href="https://twitter.com/intent/follow?screen_name=Ingo_zk">
<img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
<a href="https://github.com/ingonyama-zk/icicle/releases">
<img src="https://img.shields.io/github/v/release/ingonyama-zk/icicle" alt="GitHub Release">
</a>
<img src="https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue" alt="Machines running ICICLE">
</p>
## Background
Zero Knowledge Proofs (ZKPs) are considered one of the greatest achievements of modern cryptography. Accordingly, ZKPs are expected to disrupt a number of industries and will usher in an era of trustless and privacy preserving services and infrastructure.
@@ -115,10 +113,8 @@ This will ensure our custom hooks are run and will make it easier to follow our
- [Robik](https://github.com/robik75), for his ongoing support and mentorship
- [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab
- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
- [alxiong](https://github.com/alxiong), for adding warmup for CudaStream
- [cyl19970726](https://github.com/cyl19970726), for updating go install source in Dockerfile
## Help & Support

View File

@@ -1,105 +1,3 @@
# Golang bindings
Golang bindings allow you to use ICICLE as a golang library.
The source code for all Golang libraries can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang).
The Golang bindings are comprised of multiple packages.
[`core`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/core) which defines all shared methods and structures, such as configuration structures, or memory slices.
[`cuda-runtime`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/cuda_runtime) which defines abstractions for CUDA methods for allocating memory, initializing and managing streams, and `DeviceContext` which enables users to define and keep track of devices.
Each curve has its own package which you can find [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves). If your project uses BN254 you only need to install that single package named [`bn254`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves/bn254).
## Using ICICLE Golang bindings in your project
To add ICICLE to your `go.mod` file.
```bash
go get github.com/ingonyama-zk/icicle
```
If you want to specify a specific branch
```bash
go get github.com/ingonyama-zk/icicle@<branch_name>
```
For a specific commit
```bash
go get github.com/ingonyama-zk/icicle@<commit_id>
```
To build the shared libraries you can run this script:
```
./build.sh <curve> [G2_enabled]
curve - The name of the curve to build or "all" to build all curves
G2_enabled - Optional - To build with G2 enabled
```
For example if you want to build all curves with G2 enabled you would run:
```bash
./build.sh all ON
```
If you are interested in building a specific curve you would run:
```bash
./build.sh bls12_381 ON
```
Now you can import ICICLE into your project
```golang
import (
"github.com/stretchr/testify/assert"
"testing"
"github.com/ingonyama-zk/icicle/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
...
```
## Running tests
To run all tests, for all curves:
```bash
go test --tags=g2 ./... -count=1
```
If you don't want to include g2 tests, then drop `--tags=g2`.
If you wish to run tests for a specific curve:
```bash
go test <path_to_curve> -count=1
```
## How do Golang bindings work?
The libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
1. These libraries (named `libingo_<curve>.a`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.
2. In your Go project, you can use `cgo` to link these libraries. Here's a basic example on how you can use `cgo` to link these libraries:
```go
/*
#cgo LDFLAGS: -L/path/to/shared/libs -lingo_bn254
#include "icicle.h" // make sure you use the correct header file(s)
*/
import "C"
func main() {
// Now you can call the C functions from the ICICLE libraries.
// Note that C function calls are prefixed with 'C.' in Go code.
}
```
Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
Golang is WIP in v1, coming soon. Please check out a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for golang bindings.

View File

@@ -1,92 +0,0 @@
# MSM Pre computation
To understand the theory behind MSM pre computation technique refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
### Supported curves
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
## Core package
## MSM `PrecomputeBases`
`PrecomputeBases` and `G2PrecomputeBases` exist for all supported curves.
#### Description
This function extends each provided base point $(P)$ with its multiples $(2^lP, 2^{2l}P, ..., 2^{(\text{precompute\_factor} - 1) \cdot l}P)$, where $(l)$ is a level of precomputation determined by the `precompute_factor`. The extended set of points facilitates faster MSM computations by allowing the MSM algorithm to leverage precomputed multiples of base points, reducing the number of point additions required during the computation.
The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
#### `PrecomputeBases`
Precomputes bases for MSM by extending each base point with its multiples.
```go
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
```
##### Parameters
- **`points`**: A slice of the original affine points to be extended with their multiples.
- **`precomputeFactor`**: Determines the total number of points to precompute for each base point.
- **`c`**: Currently unused; reserved for future compatibility.
- **`ctx`**: CUDA device context specifying the execution environment.
- **`outputBases`**: The device slice allocated for storing the extended bases.
##### Example
```go
cfg := GetDefaultMSMConfig()
points := GenerateAffinePoints(1024)
precomputeFactor := 8
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
err := PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
if err != cr.CudaSuccess {
log.Fatalf("PrecomputeBases failed: %v", err)
}
```
#### `G2PrecomputeBases`
This method is the same as `PrecomputeBases` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.
```go
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
```
##### Parameters
- **`points`**: A slice of G2 curve points to be extended.
- **`precomputeFactor`**: The total number of points to precompute for each base.
- **`c`**: Reserved for future use to ensure compatibility with MSM operations.
- **`ctx`**: Specifies the CUDA device context for execution.
- **`outputBases`**: Allocated device slice for the extended bases.
##### Example
```go
cfg := G2GetDefaultMSMConfig()
points := G2GenerateAffinePoints(1024)
precomputeFactor := 8
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
err := G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
if err != cr.CudaSuccess {
log.Fatalf("G2PrecomputeBases failed: %v", err)
}
```
### Benchmarks
Benchmarks were performed on an NVIDIA RTX 3090Ti.
| Pre-computation factor | bn254 size `2^20` MSM, ms. | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| 1 | 14.1 | 82.8 | 25.5 | 136.7 |
| 2 | 11.8 | 76.6 | 20.3 | 123.8 |
| 4 | 10.9 | 73.8 | 18.1 | 117.8 |
| 8 | 10.6 | 73.7 | 17.2 | 116.0 |

View File

@@ -1,200 +0,0 @@
# MSM
### Supported curves
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
## MSM Example
```go
package main
import (
"github.com/ingonyama-zk/icicle/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
func Main() {
// Obtain the default MSM configuration.
cfg := GetDefaultMSMConfig()
// Define the size of the problem, here 2^18.
size := 1 << 18
// Generate scalars and points for the MSM operation.
scalars := GenerateScalars(size)
points := GenerateAffinePoints(size)
// Create a CUDA stream for asynchronous operations.
stream, _ := cr.CreateStream()
var p Projective
// Allocate memory on the device for the result of the MSM operation.
var out core.DeviceSlice
_, e := out.MallocAsync(p.Size(), p.Size(), stream)
if e != cr.CudaSuccess {
panic(e)
}
// Set the CUDA stream in the MSM configuration.
cfg.Ctx.Stream = &stream
cfg.IsAsync = true
// Perform the MSM operation.
e = Msm(scalars, points, &cfg, out)
if e != cr.CudaSuccess {
panic(e)
}
// Allocate host memory for the results and copy the results from the device.
outHost := make(core.HostSlice[Projective], 1)
cr.SynchronizeStream(&stream)
outHost.CopyFromDevice(&out)
// Free the device memory allocated for the results.
out.Free()
}
```
## MSM Method
```go
func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *core.MSMConfig, results core.HostOrDeviceSlice) cr.CudaError
```
### Parameters
- **scalars**: A slice containing the scalars for multiplication. It can reside either in host memory or device memory.
- **points**: A slice containing the points to be multiplied with scalars. Like scalars, these can also be in host or device memory.
- **cfg**: A pointer to an `MSMConfig` object, which contains various configuration options for the MSM operation.
- **results**: A slice where the results of the MSM operation will be stored. This slice can be in host or device memory.
### Return Value
- **CudaError**: Returns a CUDA error code indicating the success or failure of the MSM operation.
## MSMConfig
The `MSMConfig` structure holds configuration parameters for the MSM operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
```go
type MSMConfig struct {
Ctx cr.DeviceContext
PrecomputeFactor int32
C int32
Bitsize int32
LargeBucketFactor int32
batchSize int32
areScalarsOnDevice bool
AreScalarsMontgomeryForm bool
arePointsOnDevice bool
ArePointsMontgomeryForm bool
areResultsOnDevice bool
IsBigTriangle bool
IsAsync bool
}
```
### Fields
- **Ctx**: Device context containing details like device id and stream.
- **PrecomputeFactor**: Controls the number of extra points to pre-compute.
- **C**: Window bitsize, a key parameter in the "bucket method" for MSM.
- **Bitsize**: Number of bits of the largest scalar.
- **LargeBucketFactor**: Sensitivity to frequently occurring buckets.
- **batchSize**: Number of results to compute in one batch.
- **areScalarsOnDevice**: Indicates if scalars are located on the device.
- **AreScalarsMontgomeryForm**: True if scalars are in Montgomery form.
- **arePointsOnDevice**: Indicates if points are located on the device.
- **ArePointsMontgomeryForm**: True if point coordinates are in Montgomery form.
- **areResultsOnDevice**: Indicates if results are stored on the device.
- **IsBigTriangle**: If `true`, MSM will run using Large triangle accumulation; if `false`, Bucket accumulation will be used. Default value: `false`.
- **IsAsync**: If true, runs MSM asynchronously.
### Default Configuration
Use `GetDefaultMSMConfig` to obtain a default configuration, which can then be customized as needed.
```go
func GetDefaultMSMConfig() MSMConfig
```
## How do I toggle between the supported algorithms?
When creating your MSM Config you may state which algorithm you wish to use. `cfg.IsBigTriangle = true` will activate Large triangle accumulation and `cfg.IsBigTriangle = false` will activate Bucket accumulation.
```go
...
// Obtain the default MSM configuration.
cfg := GetDefaultMSMConfig()
cfg.IsBigTriangle = true
...
```
## How do I toggle between MSM modes?
Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `MSM` function.
The number of results is interpreted from the size of `var out core.DeviceSlice`. Thus it's important when allocating memory for `var out core.DeviceSlice` to make sure that you are allocating `<number of results> X <size of a single point>`.
```go
...
batchSize := 3
var p G2Projective
var out core.DeviceSlice
out.Malloc(batchSize*p.Size(), p.Size())
...
```
## Support for G2 group
To activate G2 support first you must make sure you are building the static libraries with G2 feature enabled.
```bash
./build.sh bls12_381 ON
```
Now when importing `icicle`, you should have access to G2 features.
```go
import (
"github.com/ingonyama-zk/icicle/wrappers/golang/core"
)
```
These features include `G2Projective` and `G2Affine` points as well as a `G2Msm` method.
```go
...
cfg := GetDefaultMSMConfig()
size := 1 << 12
batchSize := 3
totalSize := size * batchSize
scalars := GenerateScalars(totalSize)
points := G2GenerateAffinePoints(totalSize)
var p G2Projective
var out core.DeviceSlice
out.Malloc(batchSize*p.Size(), p.Size())
G2Msm(scalars, points, &cfg, out)
...
```
`G2Msm` works the same way as normal MSM, the difference is that it uses G2 Points.
Additionally, when you are building your application, make sure to use the `g2` feature flag:
```bash
go build -tags=g2
```

View File

@@ -1,139 +0,0 @@
# Multi GPU APIs
To learn more about the theory of Multi GPU programming refer to [this part](../multi-gpu.md) of documentation.
Here we will cover the core multi GPU APIs and an [example](#a-multi-gpu-example).
## A Multi GPU example
In this example we will display how you can
1. Fetch the number of devices installed on a machine
2. For every GPU launch a thread and set an active device per thread.
3. Execute a MSM on each GPU
```go
func main() {
numDevices, _ := cuda_runtime.GetDeviceCount()
fmt.Println("There are ", numDevices, " devices available")
wg := sync.WaitGroup{}
for i := 0; i < numDevices; i++ {
wg.Add(1)
// RunOnDevice makes sure each MSM runs on a single thread
cuda_runtime.RunOnDevice(i, func(args ...any) {
defer wg.Done()
cfg := GetDefaultMSMConfig()
cfg.IsAsync = true
for _, power := range []int{10, 18} {
size := 1 << power // 2^pwr
// generate random scalars
scalars := GenerateScalars(size)
points := GenerateAffinePoints(size)
// create a stream and allocate result pointer
stream, _ := cuda_runtime.CreateStream()
var p Projective
var out core.DeviceSlice
_, e := out.MallocAsync(p.Size(), p.Size(), stream)
// assign stream to device context
cfg.Ctx.Stream = &stream
// execute MSM
e = Msm(scalars, points, &cfg, out)
// read result from device
outHost := make(core.HostSlice[Projective], 1)
outHost.CopyFromDeviceAsync(&out, stream)
out.FreeAsync(stream)
// sync the stream
cr.SynchronizeStream(&stream)
}
})
}
wg.Wait()
}
```
This example demonstrates a basic pattern for distributing tasks across multiple GPUs. The `RunOnDevice` function ensures that each goroutine is executed on its designated GPU and a corresponding thread.
## Device Management API
To streamline device management we offer as part of `cuda_runtime` package methods for dealing with devices.
### `RunOnDevice`
Runs a given function on a specific GPU device, ensuring that all CUDA calls within the function are executed on the selected device.
In Go, most concurrency can be done via Goroutines. However, there is no guarantee that a goroutine stays on a specific host thread.
`RunOnDevice` was designed to solve this caveat and ensure that the goroutine will stay on a specific host thread.
`RunOnDevice` locks a goroutine to a specific host thread, sets the current GPU device, runs the provided function, and unlocks the goroutine from the host thread after the provided function finishes.
While the goroutine is locked to the host thread, the Go runtime will not assign other goroutines to that host thread.
**Parameters:**
- `deviceId int`: The ID of the device on which to run the provided function. Device IDs start from 0.
- `funcToRun func(args ...any)`: The function to be executed on the specified device.
- `args ...any`: Arguments to be passed to `funcToRun`.
**Behavior:**
- The function `funcToRun` is executed in a new goroutine that is locked to a specific OS thread to ensure that all CUDA calls within the function target the specified device.
- It's important to note that any goroutines launched within `funcToRun` are not automatically bound to the same GPU device. If necessary, `RunOnDevice` should be called again within such goroutines with the same `deviceId`.
**Example:**
```go
RunOnDevice(0, func(args ...any) {
fmt.Println("This runs on GPU 0")
// CUDA-related operations here will target GPU 0
}, nil)
```
### `SetDevice`
Sets the active device for the current host thread. All subsequent CUDA calls made from this thread will target the specified device.
**Parameters:**
- `device int`: The ID of the device to set as the current device.
**Returns:**
- `CudaError`: Error code indicating the success or failure of the operation.
### `GetDeviceCount`
Retrieves the number of CUDA-capable devices available on the host.
**Returns:**
- `(int, CudaError)`: The number of devices and an error code indicating the success or failure of the operation.
### `GetDevice`
Gets the ID of the currently active device for the calling host thread.
**Returns:**
- `(int, CudaError)`: The ID of the current device and an error code indicating the success or failure of the operation.
### `GetDeviceFromPointer`
Retrieves the device associated with a given pointer.
**Parameters:**
- `ptr unsafe.Pointer`: Pointer to query.
**Returns:**
- `int`: The device ID associated with the memory pointed to by `ptr`.
This documentation should provide a clear understanding of how to effectively manage multiple GPUs in Go applications using CUDA, with a particular emphasis on the `RunOnDevice` function for executing tasks on specific GPUs.

View File

@@ -1,104 +0,0 @@
# NTT
### Supported curves
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
## NTT Example
```go
package main
import (
"github.com/ingonyama-zk/icicle/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
func Main() {
// Obtain the default NTT configuration with a predefined coset generator.
cfg := GetDefaultNttConfig()
// Define the size of the input scalars.
size := 1 << 18
// Generate scalars for the NTT operation.
scalars := GenerateScalars(size)
// Set the direction of the NTT (forward or inverse).
dir := core.KForward
// Allocate memory for the results of the NTT operation.
results := make(core.HostSlice[ScalarField], size)
// Perform the NTT operation.
err := Ntt(scalars, dir, &cfg, results)
if err != cr.CudaSuccess {
panic("NTT operation failed")
}
}
```
## NTT Method
```go
func Ntt[T any](scalars core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError
```
### Parameters
- **scalars**: A slice containing the input scalars for the transform. It can reside either in host memory or device memory.
- **dir**: The direction of the NTT operation (`KForward` or `KInverse`).
- **cfg**: A pointer to an `NTTConfig` object, containing configuration options for the NTT operation.
- **results**: A slice where the results of the NTT operation will be stored. This slice can be in host or device memory.
### Return Value
- **IcicleError**: Returns an error value indicating the success or failure of the NTT operation.
## NTT Configuration (NTTConfig)
The `NTTConfig` structure holds configuration parameters for the NTT operation, allowing customization of its behavior to optimize performance based on the specifics of your protocol.
```go
type NTTConfig[T any] struct {
Ctx cr.DeviceContext
CosetGen T
BatchSize int32
ColumnsBatch bool
Ordering Ordering
areInputsOnDevice bool
areOutputsOnDevice bool
IsAsync bool
NttAlgorithm NttAlgorithm
}
```
### Fields
- **Ctx**: Device context containing details like device ID and stream ID.
- **CosetGen**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
- **BatchSize**: The number of NTTs to compute in one operation, defaulting to 1.
- **ColumnsBatch**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
- **Ordering**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`, `KMN`, `KNM`), affecting how data is arranged.
- **areInputsOnDevice**: Indicates if input scalars are located on the device.
- **areOutputsOnDevice**: Indicates if results are stored on the device.
- **IsAsync**: Controls whether the NTT operation runs asynchronously.
- **NttAlgorithm**: Explicitly select the NTT algorithm. Default value: Auto (the implementation selects radix-2 or mixed-radix algorithm based on heuristics).
### Default Configuration
Use `GetDefaultNTTConfig` to obtain a default configuration, customizable as needed.
```go
func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T]
```
### Initializing the NTT Domain
Before performing NTT operations, it's necessary to initialize the NTT domain; it only needs to be called once per GPU since the twiddles are cached.
```go
func InitDomain(primitiveRoot ScalarField, ctx cr.DeviceContext, fastTwiddles bool) core.IcicleError
```
This function initializes the domain with a given primitive root, optionally using fast twiddle factors to optimize the computation.

View File

@@ -1,132 +0,0 @@
# Vector Operations
## Overview
The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication.
## Example
### Vector addition
```go
package main
import (
"github.com/ingonyama-zk/icicle/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
func main() {
testSize := 1 << 12
a := GenerateScalars(testSize)
b := GenerateScalars(testSize)
out := make(core.HostSlice[ScalarField], testSize)
cfg := core.DefaultVecOpsConfig()
// Perform vector addition
err := VecOp(a, b, out, cfg, core.Add)
if err != cr.CudaSuccess {
panic("Vector addition failed")
}
}
```
### Vector Subtraction
```go
package main
import (
"github.com/ingonyama-zk/icicle/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
func main() {
testSize := 1 << 12
a := GenerateScalars(testSize)
b := GenerateScalars(testSize)
out := make(core.HostSlice[ScalarField], testSize)
cfg := core.DefaultVecOpsConfig()
// Perform vector subtraction
err := VecOp(a, b, out, cfg, core.Sub)
if err != cr.CudaSuccess {
panic("Vector subtraction failed")
}
}
```
### Vector Multiplication
```go
package main
import (
"github.com/ingonyama-zk/icicle/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
func main() {
testSize := 1 << 12
a := GenerateScalars(testSize)
b := GenerateScalars(testSize)
out := make(core.HostSlice[ScalarField], testSize)
cfg := core.DefaultVecOpsConfig()
// Perform vector multiplication
err := VecOp(a, b, out, cfg, core.Mul)
if err != cr.CudaSuccess {
panic("Vector multiplication failed")
}
}
```
## VecOps Method
```go
func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError)
```
### Parameters
- **a**: The first input vector.
- **b**: The second input vector.
- **out**: The output vector where the result of the operation will be stored.
- **config**: A `VecOpsConfig` object containing various configuration options for the vector operations.
- **op**: The operation to perform, specified as one of the constants (`Sub`, `Add`, `Mul`) from the `VecOps` type.
### Return Value
- **CudaError**: Returns a CUDA error code indicating the success or failure of the vector operation.
## VecOpsConfig
The `VecOpsConfig` structure holds configuration parameters for the vector operations, allowing customization of its behavior.
```go
type VecOpsConfig struct {
Ctx cr.DeviceContext
isAOnDevice bool
isBOnDevice bool
isResultOnDevice bool
IsResultMontgomeryForm bool
IsAsync bool
}
```
### Fields
- **Ctx**: Device context containing details like device ID and stream ID.
- **isAOnDevice**: Indicates if vector `a` is located on the device.
- **isBOnDevice**: Indicates if vector `b` is located on the device.
- **isResultOnDevice**: Specifies where the result vector should be stored (device or host memory).
- **IsResultMontgomeryForm**: Determines if the result vector should be in Montgomery form.
- **IsAsync**: Controls whether the vector operation runs asynchronously.
### Default Configuration
Use `DefaultVecOpsConfig` to obtain a default configuration, customizable as needed.
```go
func DefaultVecOpsConfig() VecOpsConfig
```

View File

@@ -1,8 +1,8 @@
# What is ICICLE?
[![GitHub Release](https://img.shields.io/github/v/release/ingonyama-zk/icicle)](https://github.com/ingonyama-zk/icicle/releases)
[![Static Badge](https://img.shields.io/badge/Latest-v1.4.0-8a2be2)](https://github.com/ingonyama-zk/icicle/releases)
![Static Badge](https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue)

View File

@@ -49,17 +49,13 @@ Accelerating MSM is crucial to a ZK protocol's performance due to the [large per
You can learn more about how MSMs work from this [video](https://www.youtube.com/watch?v=Bl5mQA7UL2I) and from our resource list on [Ingopedia](https://www.ingonyama.com/ingopedia/msm).
# Using MSM
## Supported curves
MSM supports the following curves:
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
## Supported Bindings
- [Golang](../golang-bindings/msm.md)
- [Rust](../rust-bindings/msm.md)
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
## Supported algorithms
@@ -83,6 +79,25 @@ Large Triangle Accumulation is a method for optimizing MSM which focuses on redu
The Large Triangle Accumulation algorithm is more sequential in nature, as it builds upon each step sequentially (accumulating sums and then performing doubling). This structure can make it less suitable for parallelization but potentially more efficient for a <b>large batch of smaller MSM computations</b>.
### How do I toggle between the supported algorithms?
When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
```rust
...
let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
// is_big_triangle will determine which algorithm to use
cfg_bls12377.is_big_triangle = true;
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
...
```
You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
## MSM Modes
ICICLE MSM also supports two different modes `Batch MSM` and `Single MSM`
@@ -94,3 +109,54 @@ Batch MSM allows you to run many MSMs with a single API call, Single MSM will la
This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs leveraging the parallel processing capabilities of GPUs.
Single MSM mode should be used when batching isn't possible or when you have to run a single MSM.
### How do I toggle between MSM modes?
Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
```rust
...
let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
...
```
In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
```rust
...
let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
...
```
Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
## Support for G2 group
MSM also supports G2 group.
Using MSM in G2 requires a G2 config, and of course your Points should also be G2 Points.
```rust
...
let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
...
```
Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.

View File

@@ -28,10 +28,6 @@ NTT supports the following curves:
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
## Supported Bindings
- [Golang](../golang-bindings/ntt.md)
- [Rust](../rust-bindings/ntt.md)
### Examples
@@ -39,6 +35,87 @@ NTT supports the following curves:
- [C++ API examples](https://github.com/ingonyama-zk/icicle/blob/d84ffd2679a4cb8f8d1ac2ad2897bc0b95f4eeeb/examples/c%2B%2B/ntt/example.cu#L1)
## NTT API overview
```rust
pub fn ntt<F>(
input: &HostOrDeviceSlice<F>,
dir: NTTDir,
cfg: &NTTConfig<F>,
output: &mut HostOrDeviceSlice<F>,
) -> IcicleResult<()>
```
`ntt::ntt` expects:
`input` - buffer to read the inputs of the NTT from. <br/>
`dir` - whether to compute forward or inverse NTT. <br/>
`cfg` - config used to specify extra arguments of the NTT. <br/>
`output` - buffer to write the NTT outputs into. Must be of the same size as input.
The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
### NTT Config
```rust
pub struct NTTConfig<'a, S> {
pub ctx: DeviceContext<'a>,
pub coset_gen: S,
pub batch_size: i32,
pub ordering: Ordering,
are_inputs_on_device: bool,
are_outputs_on_device: bool,
pub is_async: bool,
pub ntt_algorithm: NttAlgorithm,
}
```
The `NTTConfig` struct is a configuration object used to specify parameters for an NTT instance.
#### Fields
- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
- **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.
- **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.
- **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded on the device memory. If `false` inputs will be copied from host to device.
- **`are_outputs_on_device: bool`**: Indicates whether the output data should remain in device memory. If `false`, outputs will be copied from device to host. If the inputs and outputs are the same pointer, the NTT will be computed in place.
- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to use that specific algorithm.
#### Usage
Example initialization with default settings:
```rust
let default_config = NTTConfig::default();
```
Customizing the configuration:
```rust
let custom_config = NTTConfig {
ctx: custom_device_context,
coset_gen: my_coset_generator,
batch_size: 10,
ordering: Ordering::kRN,
are_inputs_on_device: true,
are_outputs_on_device: true,
is_async: false,
ntt_algorithm: NttAlgorithm::MixedRadix,
};
```
### Ordering
The `Ordering` enum defines how inputs and outputs are arranged for the NTT operation, offering flexibility in handling data according to different algorithmic needs or compatibility requirements. It primarily affects the sequencing of data points for the transform, which can influence both performance and the compatibility with certain algorithmic approaches. The available ordering options are:
@@ -63,6 +140,15 @@ NTT also supports two different modes `Batch NTT` and `Single NTT`
Batch NTT allows you to run many NTTs with a single API call, while Single NTT will launch a single NTT computation.
You may toggle between single and batch NTT by simply configuring `batch_size` to be larger than 1 in your `NTTConfig`.
```rust
let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
cfg.batch_size = 10 // your ntt using this config will run in batch mode.
```
`batch_size=1` would keep our NTT in single NTT mode.
Deciding whether to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.
**Single NTT Mode**
@@ -146,11 +232,9 @@ Mixed Radix can reduce the number of stages required to compute for large inputs
### Which algorithm should I choose ?
Both work only on inputs of power of 2 (e.g., 256, 512, 1024).
Radix 2 is faster for small NTTs. A small NTT would be around logN = 16 and batch size 1. It's also more suited for inputs which are powers of 2 (e.g., 256, 512, 1024). Radix 2 won't necessarily perform better for smaller `logn` with larger batches.
Radix 2 is faster for small NTTs. A small NTT would be around logN = 16 and batch size 1. Radix 2 won't necessarily perform better for smaller `logn` with larger batches.
Mixed radix on the other hand works better for larger NTTs with larger input sizes.
Mixed radix, on the other hand, is better for larger NTTs with larger input sizes which are not necessarily a power of 2.
Performance really depends on logn size, batch size, ordering, inverse, coset, coeff-field and which GPU you are using.

View File

@@ -6,6 +6,5 @@ This section of the documentation is dedicated to the ICICLE primitives, we will
## Supported primitives
- [MSM](./msm.md)
- [NTT](./ntt.md)
- [MSM](./msm)
- [Poseidon Hash](./poseidon.md)

View File

@@ -1,63 +0,0 @@
# MSM Pre computation
To understand the theory behind MSM pre computation technique refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
### Supported curves
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `Grumpkin`
### `precompute_bases`
Precomputes bases for the multi-scalar multiplication (MSM) by extending each base point with its multiples, facilitating more efficient MSM calculations.
```rust
pub fn precompute_bases<C: Curve + MSM<C>>(
points: &HostOrDeviceSlice<Affine<C>>,
precompute_factor: i32,
_c: i32,
ctx: &DeviceContext,
output_bases: &mut HostOrDeviceSlice<Affine<C>>,
) -> IcicleResult<()>
```
#### Parameters
- **`points`**: The original set of affine points $(P_1, P_2, ..., P_n)$ to be used in the MSM. For batch MSM operations, this should include all unique points concatenated together.
- **`precompute_factor`**: Specifies the total number of points to precompute for each base, including the base point itself. This parameter directly influences the memory requirements and the potential speedup of the MSM operation.
- **`_c`**: Currently unused. Intended for future use to align with the `c` parameter in `MSMConfig`, ensuring the precomputation is compatible with the bucket method's window size used in MSM.
- **`ctx`**: The device context specifying the device ID and stream for execution. This context determines where the precomputation is performed (e.g., on a specific GPU).
- **`output_bases`**: The output buffer for the extended bases. Its size must be `points.len() * precompute_factor`. This buffer should be allocated on the device for GPU computations.
#### Returns
`Ok(())` if the operation is successful, or an `IcicleResult` error otherwise.
#### Description
This function extends each provided base point $(P)$ with its multiples $(2^lP, 2^{2l}P, ..., 2^{(\text{precompute\_factor} - 1) \cdot l}P)$, where $(l)$ is a level of precomputation determined by the `precompute_factor`. The extended set of points facilitates faster MSM computations by allowing the MSM algorithm to leverage precomputed multiples of base points, reducing the number of point additions required during the computation.
The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
#### Example Usage
```rust
let device_context = DeviceContext::default_for_device(0); // Use the default device
let precompute_factor = 4; // Number of points to precompute
let mut extended_bases = HostOrDeviceSlice::cuda_malloc(expected_size).expect("Failed to allocate memory for extended bases");
// Precompute the bases using the specified factor
precompute_bases(&points, precompute_factor, 0, &device_context, &mut extended_bases)
.expect("Failed to precompute bases");
```
### Benchmarks
Benchmarks were performed on an Nvidia RTX 3090Ti.
| Pre-computation factor | bn254 size `2^20` MSM, ms. | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| 1 | 14.1 | 82.8 | 25.5 | 136.7 |
| 2 | 11.8 | 76.6 | 20.3 | 123.8 |
| 4 | 10.9 | 73.8 | 18.1 | 117.8 |
| 8 | 10.6 | 73.7 | 17.2 | 116.0 |

View File

@@ -1,172 +0,0 @@
# MSM
### Supported curves
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
## Example
```rust
use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarCfg};
use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
/// Computes a single BN254 MSM over 2^10 random points using an asynchronous CUDA stream.
fn main() {
    // Number of (scalar, point) pairs fed into the MSM
    let size: usize = 1 << 10;

    println!("Generating random G1 points and scalars for BN254...");
    let random_points = CurveCfg::generate_random_affine_points(size);
    let random_scalars = ScalarCfg::generate_random(size);

    // msm() takes its inputs as HostOrDeviceSlice; both inputs stay in host memory here
    let points_host = HostOrDeviceSlice::Host(random_points);
    let scalars_host = HostOrDeviceSlice::Host(random_scalars);

    // One device-side slot receives the MSM output
    let mut msm_results: HostOrDeviceSlice<'_, G1Projective> =
        HostOrDeviceSlice::cuda_malloc(1).expect("Failed to allocate CUDA memory for MSM results");

    // Dedicated stream so the MSM can be launched asynchronously
    let stream = CudaStream::create().expect("Failed to create CUDA stream");
    let mut cfg = msm::MSMConfig::default();
    cfg.ctx.stream = &stream;
    cfg.is_async = true;

    println!("Executing MSM on device...");
    msm::msm(&scalars_host, &points_host, &cfg, &mut msm_results).expect("Failed to execute MSM");

    // The call above was asynchronous: wait for the stream before using the result
    stream
        .synchronize()
        .expect("Failed to synchronize CUDA stream");

    // Optionally, copy the result to the host here for further processing or printing
    println!("MSM execution complete.");
}
```
## MSM API Overview
```rust
pub fn msm<C: Curve>(
scalars: &HostOrDeviceSlice<C::ScalarField>,
points: &HostOrDeviceSlice<Affine<C>>,
cfg: &MSMConfig,
results: &mut HostOrDeviceSlice<Projective<C>>,
) -> IcicleResult<()>
```
### Parameters
- **`scalars`**: A buffer containing the scalar values to be multiplied with corresponding points.
- **`points`**: A buffer containing the points to be multiplied by the scalars.
- **`cfg`**: MSM configuration specifying additional parameters for the operation.
- **`results`**: A buffer where the results of the MSM operations will be stored.
### MSM Config
```rust
pub struct MSMConfig<'a> {
pub ctx: DeviceContext<'a>,
points_size: i32,
pub precompute_factor: i32,
pub c: i32,
pub bitsize: i32,
pub large_bucket_factor: i32,
batch_size: i32,
are_scalars_on_device: bool,
pub are_scalars_montgomery_form: bool,
are_points_on_device: bool,
pub are_points_montgomery_form: bool,
are_results_on_device: bool,
pub is_big_triangle: bool,
pub is_async: bool,
}
```
- **`ctx: DeviceContext`**: Specifies the device context, device id and the CUDA stream for asynchronous execution.
- **`points_size: i32`**:
- **`precompute_factor: i32`**: Determines the number of extra points to pre-compute for each point, affecting memory footprint and performance.
- **`c: i32`**: The "window bitsize," a parameter controlling the computational complexity and memory footprint of the MSM operation.
- **`bitsize: i32`**: The number of bits of the largest scalar, typically equal to the bit size of the scalar field.
- **`large_bucket_factor: i32`**: Adjusts the algorithm's sensitivity to frequently occurring buckets, useful for non-uniform scalar distributions.
- **`batch_size: i32`**: The number of MSMs to compute in a single batch, for leveraging parallelism.
- **`are_scalars_montgomery_form`**: Set to `true` if scalars are in montgomery form.
- **`are_points_montgomery_form`**: Set to `true` if points are in montgomery form.
- **`are_scalars_on_device: bool`**, **`are_points_on_device: bool`**, **`are_results_on_device: bool`**: Indicate whether the corresponding buffers are on the device memory.
- **`is_big_triangle`**: If `true`, MSM will run using Large triangle accumulation; if `false`, Bucket accumulation will be used. Default value: `false`.
- **`is_async: bool`**: Whether to perform the MSM operation asynchronously.
### Usage
The `msm` function is designed to compute the sum of multiple scalar-point multiplications efficiently. It supports both single MSM operations and batched operations for increased performance. The configuration allows for detailed control over the execution environment and performance characteristics of the MSM operation.
When performing MSM operations, it's crucial to match the size of the `scalars` and `points` arrays correctly and ensure that the `results` buffer is appropriately sized to hold the output. The `MSMConfig` should be set up to reflect the specifics of the operation, including whether the operation should be asynchronous and any device-specific settings.
## How do I toggle between the supported algorithms?
When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
```rust
...
let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
// is_big_triangle will determine which algorithm to use
cfg_bls12377.is_big_triangle = true;
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
...
```
You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
## How do I toggle between MSM modes?
Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
```rust
...
let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
...
```
In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
```rust
...
let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
...
```
Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
## Support for G2 group
MSM also supports G2 group.
Using MSM in G2 requires a G2 config, and of course your Points should also be G2 Points.
```rust
...
let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
...
```
Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.

View File

@@ -4,54 +4,6 @@ To learn more about the theory of Multi GPU programming refer to [this part](../
Here we will cover the core multi GPU APIs and an [example](#a-multi-gpu-example)
## A Multi GPU example
In this example we will display how you can
1. Fetch the number of devices installed on a machine
2. For every GPU launch a thread and set an active device per thread.
3. Execute a MSM on each GPU
```rust
...
let device_count = get_device_count().unwrap();
(0..device_count)
.into_par_iter()
.for_each(move |device_id| {
set_device(device_id).unwrap();
// you can allocate points and scalars_d here
let mut cfg = MSMConfig::default_for_device(device_id);
cfg.ctx.stream = &stream;
cfg.is_async = true;
cfg.are_scalars_montgomery_form = true;
msm(&scalars_d, &HostOrDeviceSlice::on_host(points), &cfg, &mut msm_results).unwrap();
// collect and process results
})
...
```
We use `get_device_count` to fetch the number of connected devices, device IDs will be `0, 1, 2, ..., device_count - 1`
[`into_par_iter`](https://docs.rs/rayon/latest/rayon/iter/trait.IntoParallelIterator.html#tymethod.into_par_iter) is a parallel iterator, you should expect it to launch a thread for every iteration.
We then call `set_device(device_id).unwrap();`, which sets the context of that thread to the selected `device_id`.
Any data you now allocate from the context of this thread will be linked to the `device_id`. We create our `MSMConfig` with the selected device ID `let mut cfg = MSMConfig::default_for_device(device_id);`; behind the scenes this creates a `DeviceContext` configured for that specific GPU.
We finally call our `msm` method.
## Device management API
To streamline device management we offer as part of `icicle-cuda-runtime` package methods for dealing with devices.
@@ -200,3 +152,50 @@ let device_id: i32 = 0; // Example device ID
check_device(device_id);
// Ensures that the current context is correctly set for the specified device ID.
```
## A Multi GPU example
In this example we will display how you can
1. Fetch the number of devices installed on a machine
2. For every GPU launch a thread and set an active device per thread.
3. Execute a MSM on each GPU
```rust
...
let device_count = get_device_count().unwrap();
(0..device_count)
.into_par_iter()
.for_each(move |device_id| {
set_device(device_id).unwrap();
// you can allocate points and scalars_d here
let mut cfg = MSMConfig::default_for_device(device_id);
cfg.ctx.stream = &stream;
cfg.is_async = true;
cfg.are_scalars_montgomery_form = true;
msm(&scalars_d, &HostOrDeviceSlice::on_host(points), &cfg, &mut msm_results).unwrap();
// collect and process results
})
...
```
We use `get_device_count` to fetch the number of connected devices, device IDs will be `0...device_count-1`
[`into_par_iter`](https://docs.rs/rayon/latest/rayon/iter/trait.IntoParallelIterator.html#tymethod.into_par_iter) is a parallel iterator, you should expect it to launch a thread for every iteration.
We then call `set_device(device_id).unwrap();`, which sets the context of that thread to the selected `device_id`.
Any data you now allocate from the context of this thread will be linked to the `device_id`. We create our `MSMConfig` with the selected device ID `let mut cfg = MSMConfig::default_for_device(device_id);`; behind the scenes this creates a `DeviceContext` configured for that specific GPU.
We finally call our `msm` method.

View File

@@ -1,199 +0,0 @@
# NTT
### Supported curves
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
## Example
```rust
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_core::{ntt::{self, NTT}, traits::GenerateRandom};
use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
/// Runs a forward NTT of size 2^12 over random BN254 scalars on a CUDA stream.
fn main() {
    let size = 1 << 12; // Define the size of your input, e.g., 2^12

    // Root of unity of order `size`. `get_root_of_unity` returns an `Option`,
    // so it is unwrapped before being handed to `from_ark` below.
    let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(
        size.try_into()
            .unwrap(),
    )
    .unwrap();

    // Generate random inputs
    println!("Generating random inputs...");
    let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));

    // Allocate memory on CUDA device for NTT results
    let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).expect("Failed to allocate CUDA memory");

    // Create a CUDA stream
    let stream = CudaStream::create().expect("Failed to create CUDA stream");
    let ctx = DeviceContext::default(); // Assuming default device context

    // The NTT domain has to be initialized before any NTT is executed
    ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();

    // Configure NTT
    let mut cfg = ntt::NTTConfig::default();
    cfg.ctx.stream = &stream;
    cfg.is_async = true; // Set to true for asynchronous execution

    // Execute NTT on device
    println!("Executing NTT on device...");
    ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).expect("Failed to execute NTT");

    // Synchronize CUDA stream to ensure completion
    stream.synchronize().expect("Failed to synchronize CUDA stream");

    // Optionally, move results to host for further processing or verification
    println!("NTT execution complete.");
}
```
## NTT API overview
```rust
pub fn ntt<F>(
input: &HostOrDeviceSlice<F>,
dir: NTTDir,
cfg: &NTTConfig<F>,
output: &mut HostOrDeviceSlice<F>,
) -> IcicleResult<()>
```
`ntt::ntt` expects:
`input` - buffer to read the inputs of the NTT from. <br/>
`dir` - whether to compute forward or inverse NTT. <br/>
`cfg` - config used to specify extra arguments of the NTT. <br/>
`output` - buffer to write the NTT outputs into. Must be of the same size as input.
The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
### NTT Config
```rust
pub struct NTTConfig<'a, S> {
pub ctx: DeviceContext<'a>,
pub coset_gen: S,
pub batch_size: i32,
pub columns_batch: bool,
pub ordering: Ordering,
are_inputs_on_device: bool,
are_outputs_on_device: bool,
pub is_async: bool,
pub ntt_algorithm: NttAlgorithm,
}
```
The `NTTConfig` struct is a configuration object used to specify parameters for an NTT instance.
#### Fields
- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
- **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.
- **`columns_batch`**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
- **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.
- **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded on the device memory. If `false` inputs will be copied from host to device.
- **`are_outputs_on_device: bool`**: Indicates whether the output buffer is located in device memory. If `false`, outputs will be copied from device to host. If the inputs and outputs are the same pointer, the NTT will be computed in place.
- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to use that specific algorithm.
#### Usage
Example initialization with default settings:
```rust
let default_config = NTTConfig::default();
```
Customizing the configuration:
```rust
let custom_config = NTTConfig {
ctx: custom_device_context,
coset_gen: my_coset_generator,
batch_size: 10,
columns_batch: false,
ordering: Ordering::kRN,
are_inputs_on_device: true,
are_outputs_on_device: true,
is_async: false,
ntt_algorithm: NttAlgorithm::MixedRadix,
};
```
### Modes
NTT supports two different modes `Batch NTT` and `Single NTT`
You may toggle between single and batch NTT by simply configuring `batch_size` to be larger than 1 in your `NTTConfig`.
```rust
let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
cfg.batch_size = 10 // your ntt using this config will run in batch mode.
```
`batch_size=1` would keep our NTT in single NTT mode.
Deciding whether to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.
### Initializing the NTT Domain
Before performing NTT operations, it's necessary to initialize the NTT domain. This only needs to be done once per GPU since the twiddles are cached.
```rust
ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
```
### `initialize_domain`
```rust
pub fn initialize_domain<F>(primitive_root: F, ctx: &DeviceContext) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: NTT<F>;
```
#### Parameters
- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
#### Returns
- **`IcicleResult<()>`**: Will return an error if the operation fails.
### `initialize_domain_fast_twiddles_mode`
Similar to `initialize_domain`, `initialize_domain_fast_twiddles_mode` is a faster implementation and can be used for larger NTTs.
```rust
pub fn initialize_domain_fast_twiddles_mode<F>(primitive_root: F, ctx: &DeviceContext) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: NTT<F>;
```
#### Parameters
- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
#### Returns
- **`IcicleResult<()>`**: Will return an error if the operation fails.

View File

@@ -1,159 +0,0 @@
# Vector Operations API
Our vector operations API, which is part of the `icicle-cuda-runtime` package, includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory.
## Supported curves
Vector operations are supported on the following curves:
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
## Examples
### Addition of Scalars
```rust
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_core::vec_ops::{add_scalars};
let test_size = 1 << 18;
let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
let cfg = VecOpsConfig::default();
add_scalars(&a, &b, &mut result, &cfg).unwrap();
```
### Subtraction of Scalars
```rust
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_core::vec_ops::{sub_scalars};
let test_size = 1 << 18;
let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
let cfg = VecOpsConfig::default();
sub_scalars(&a, &b, &mut result, &cfg).unwrap();
```
### Multiplication of Scalars
```rust
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_core::vec_ops::{mul_scalars};
let test_size = 1 << 18;
let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
let ones: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::one(); test_size]);
let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
let cfg = VecOpsConfig::default();
mul_scalars(&a, &ones, &mut result, &cfg).unwrap();
```
## Vector Operations Configuration
The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes.
### `VecOpsConfig`
Defines configuration parameters for vector operations.
```rust
pub struct VecOpsConfig<'a> {
pub ctx: DeviceContext<'a>,
is_a_on_device: bool,
is_b_on_device: bool,
is_result_on_device: bool,
is_result_montgomery_form: bool,
pub is_async: bool,
}
```
#### Fields
- **`ctx: DeviceContext<'a>`**: Specifies the device context for the operation, including the device ID and memory pool.
- **`is_a_on_device`**: Indicates if the first operand vector resides in device memory.
- **`is_b_on_device`**: Indicates if the second operand vector resides in device memory.
- **`is_result_on_device`**: Specifies if the result vector should be stored in device memory.
- **`is_result_montgomery_form`**: Determines if the result should be in Montgomery form.
- **`is_async`**: Enables asynchronous operation. If `true`, operations are non-blocking; otherwise, they block the current thread.
### Default Configuration
`VecOpsConfig` can be initialized with default settings tailored for a specific device:
```
let cfg = VecOpsConfig::default();
```
These are the default settings.
```rust
impl<'a> Default for VecOpsConfig<'a> {
fn default() -> Self {
Self::default_for_device(DEFAULT_DEVICE_ID)
}
}
impl<'a> VecOpsConfig<'a> {
pub fn default_for_device(device_id: usize) -> Self {
VecOpsConfig {
ctx: DeviceContext::default_for_device(device_id),
is_a_on_device: false,
is_b_on_device: false,
is_result_on_device: false,
is_result_montgomery_form: false,
is_async: false,
}
}
}
```
## Vector Operations
Vector operations are implemented through the `VecOps` trait, which is implemented for all [supported curves](#supported-curves), providing methods for addition, subtraction, and multiplication of vectors.
### `VecOps` Trait
```rust
pub trait VecOps<F> {
fn add(
a: &HostOrDeviceSlice<F>,
b: &HostOrDeviceSlice<F>,
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
fn sub(
a: &HostOrDeviceSlice<F>,
b: &HostOrDeviceSlice<F>,
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
fn mul(
a: &HostOrDeviceSlice<F>,
b: &HostOrDeviceSlice<F>,
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
}
```
#### Methods
All operations are element-wise, and the results are placed into the `result` param. These operations are not in place.
- **`add`**: Computes the element-wise sum of two vectors.
- **`sub`**: Computes the element-wise difference between two vectors.
- **`mul`**: Performs element-wise multiplication of two vectors.

View File

@@ -6,56 +6,7 @@ We understand the need for ZK developers to use different curves, some common so
ICICLE core is very generic by design so all algorithms and primitives are designed to work based on configuration files [selected during compile](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh) time. This is why we compile ICICLE Core per curve.
To add support for a new curve you must create a new file under [`icicle/curves`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves). The file should be named `<curve_name>_params.cuh`.
### Adding curve_name_params.cuh
Start by copying `bn254_params.cuh` contents in your params file. Params should include:
- **fq_config** - parameters of the Base field.
- **limbs_count** - `ceil(field_byte_size / 4)`.
- **modulus_bit_count** - bit-size of the modulus.
- **num_of_reductions** - the number of times to reduce in reduce function. Use 2 if not sure.
- **modulus** - modulus of the field.
- **modulus_2** - modulus * 2.
- **modulus_4** - modulus * 4.
- **neg_modulus** - negated modulus.
- **modulus_wide** - modulus represented as a double-sized integer.
- **modulus_squared** - modulus**2 represented as a double-sized integer.
- **modulus_squared_2** - 2 * modulus**2 represented as a double-sized integer.
- **modulus_squared_4** - 4 * modulus**2 represented as a double-sized integer.
- **m** - value used in multiplication. Can be computed as `2**(2*modulus_bit_count) // modulus`.
- **one** - multiplicative identity.
- **zero** - additive identity.
- **montgomery_r** - `2 ** M % modulus` where M is the closest multiple of 32 that is larger than the field's bitsize, e.g. 384 or 768 for bls and bw curves respectively.
- **montgomery_r_inv** - `2 ** (-M) % modulus`
- **fp_config** - parameters of the Scalar field.
Same as fq_config, but with additional arguments:
- **omegas_count** - [two-adicity](https://cryptologie.net/article/559/whats-two-adicity/) of the field. And thus the maximum size of NTT.
- **omegas** - an array of omegas for NTTs. An array of size `omegas_count`. The ith element is equal to `1.nth_root(2**(2**(omegas_count-i)))`.
- **inv** - an array of inverses of powers of two in a field. Ith element is equal to `(2 ** (i+1)) ** -1`.
- **G1 generators points** - affine coordinates of the generator point.
- **G2 generators points** - affine coordinates of the extension generator. Remove these if `G2` is not supported.
- **Weierstrass b value** - base field element equal to value of `b` in the curve equation.
- **Weierstrass b value G2** - base field element equal to value of `b` for the extension. Remove this if `G2` is not supported.
:::note
All the params are not in Montgomery form.
:::
:::note
To convert number values into `storage` type you can use the following python function
```python
import struct
def unpack(x, field_size):
    """Format integer ``x`` as comma-separated 32-bit limbs, least significant first.

    ``field_size`` is the element size in bytes and is expected to be a multiple
    of 4; the result lists ``field_size // 4`` limbs, each rendered as ``0x``
    followed by eight hex digits.
    """
    # The repeat count must be parenthesised: `'I' * field_size // 4` parses as
    # `('I' * field_size) // 4` and raises TypeError. The '<' prefix pins the
    # limb byte order to the little-endian layout produced by to_bytes().
    limbs = struct.unpack('<' + 'I' * (field_size // 4), int(x).to_bytes(field_size, 'little'))
    return ', '.join("0x" + format(limb, '08x') for limb in limbs)
```
:::
To add support for a new curve you must create a new file under [`icicle/curves`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves). The file should be named `<curve_name>_params.cuh`.
We also require some changes to [`curve_config.cuh`](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh#L16-L29), we need to add a new curve id.
@@ -77,40 +28,58 @@ Make sure to modify the [rest of the file](https://github.com/ingonyama-zk/icicl
Finally we must modify the [`make` file](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L64) to make sure we can compile our new curve.
```
set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin;<curve_name>)
set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;<curve_name>)
```
### Adding Poseidon support
If you want your curve to implement a Poseidon hash function or a tree builder, you will need to pre-calculate its optimized parameters.
Copy [constants_template.h](https://github.com/ingonyama-zk/icicle/blob/main/icicle/appUtils/poseidon/constants/constants_template.h) into `icicle/appUtils/poseidon/constants/<CURVE>_poseidon.h`. Run the [constants generation script](https://dev.ingonyama.com/icicle/primitives/poseidon#constants). The script will print the number of partial rounds and generate a `constants.bin` file. Use `xxd -i constants.bin` to parse the file into C declarations. Copy the `unsigned char constants_bin[]` contents inside your new file. Repeat this process for arities 2, 4, 8 and 11.
After you've generated the constants, add your curve in this [SUPPORTED_CURVES_WITH_POSEIDON](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L72) in the `CMakeLists.txt`.
## Bindings
In order to support a new curve in the binding libraries you first must support it in ICICLE core.
In order to support a new curve in the binding libraries you first must support it in ICICLE core.
### Rust
Go to [rust curves folder](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-curves) and copy `icicle-curve-template` to a new folder named `icicle-<curve_name>`.
Create a new folder named `icicle-<curve_name>` under the [rust wrappers folder](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-curves). Your new directory should look like this.
Find all the occurrences of `<CURVE>` placeholder inside the crate. (You can use `Ctrl+Shift+F` in VS Code or `grep -nr "<CURVE>"` in bash). You will then need to replace each occurrence with your new curve name.
```
└── rust
├── icicle-curves
├── icicle-<curve_name>
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   └── src/
│   │   ├── curve.rs
│   │   ├── lib.rs
│   │   ├── msm/
│   │   │   └── mod.rs
│   │   └── ntt/
│   │   └── mod.rs
```
#### Limbs
Let's look at [`ntt/mod.rs`](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs) for example.
Go to your curve's `curve.rs` file and set `SCALAR_LIMBS`, `BASE_LIMBS` and `G2_BASE_LIMBS` (if G2 is needed) to a minimum number of `u64` required to store a single scalar field / base field element respectively.
e.g. for bn254, scalar field is 254 bit so `SCALAR_LIMBS` is set to 4.
```
...
#### Primitives
extern "C" {
#[link_name = "bn254NTTCuda"]
fn ntt_cuda<'a>(
input: *const ScalarField,
size: usize,
is_inverse: bool,
config: &NTTConfig<'a, ScalarField>,
output: *mut ScalarField,
) -> CudaError;
If your curve doesn't support some of the primitives (ntt/msm/poseidon/merkle tree), or you simply don't want to include them, just remove the corresponding module from `src` and then from `lib.rs`.
#[link_name = "bn254DefaultNTTConfig"]
fn default_ntt_config() -> NTTConfig<'static, ScalarField>;
#### G2
#[link_name = "bn254InitializeDomain"]
fn initialize_ntt_domain(primitive_root: ScalarField, ctx: &DeviceContext) -> CudaError;
}
If your curve doesn't support G2 - remove all the code under `#[cfg(feature = "g2")]` and remove the feature from [Cargo.toml](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml#L29) and [build.rs](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs#L15).
...
```
After this is done, add your new crate in the [global Cargo.toml](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/Cargo.toml).
Here you would need to replace `bn254NTTCuda` with `<curve_name>NTTCuda`. Most of these changes are pretty straightforward. One thing you should pay attention to is limb sizes as these change for different curves. For example `BN254` [has limb size of 8](https://github.com/ingonyama-zk/icicle/blob/4beda3a900eda961f39af3a496f8184c52bf3b41/wrappers/rust/icicle-curves/icicle-bn254/src/curve.rs#L15) but for your curve this may be different.
### Golang

View File

@@ -9,7 +9,7 @@ const config = {
title: 'Ingonyama Developer Documentation',
tagline: 'Ingonyama is a next-generation semiconductor company, focusing on Zero-Knowledge Proof hardware acceleration. We build accelerators for advanced cryptography, unlocking real-time applications.',
url: 'https://dev.ingonyama.com/',
baseUrl: '/',
baseUrl: '/icicle/',
onBrokenLinks: 'throw',
onBrokenMarkdownLinks: 'warn',
favicon: 'img/logo.png',
@@ -29,13 +29,13 @@ const config = {
remarkPlugins: [math, require('mdx-mermaid')],
rehypePlugins: [katex],
sidebarPath: require.resolve('./sidebars.js'),
editUrl: 'https://github.com/ingonyama-zk/icicle/tree/main',
editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
},
blog: {
remarkPlugins: [math, require('mdx-mermaid')],
rehypePlugins: [katex],
showReadingTime: true,
editUrl: 'https://github.com/ingonyama-zk/icicle/tree/main',
editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
},
pages: {},
theme: {

View File

@@ -25,46 +25,9 @@ module.exports = {
id: "icicle/integrations"
},
{
type: "category",
type: "doc",
label: "Golang bindings",
link: {
type: `doc`,
id: "icicle/golang-bindings",
},
collapsed: true,
items: [
{
type: "category",
label: "MSM",
link: {
type: `doc`,
id: "icicle/golang-bindings/msm",
},
collapsed: true,
items: [
{
type: "doc",
label: "MSM pre computation",
id: "icicle/golang-bindings/msm-pre-computation",
}
]
},
{
type: "doc",
label: "NTT",
id: "icicle/golang-bindings/ntt",
},
{
type: "doc",
label: "Vector operations",
id: "icicle/golang-bindings/vec-ops",
},
{
type: "doc",
label: "Multi GPU Support",
id: "icicle/golang-bindings/multi-gpu",
},
]
id: "icicle/golang-bindings",
},
{
type: "category",
@@ -75,38 +38,12 @@ module.exports = {
},
collapsed: true,
items: [
{
type: "category",
label: "MSM",
link: {
type: `doc`,
id: "icicle/rust-bindings/msm",
},
collapsed: true,
items: [
{
type: "doc",
label: "MSM pre computation",
id: "icicle/rust-bindings/msm-pre-computation",
}
]
},
{
type: "doc",
label: "NTT",
id: "icicle/rust-bindings/ntt",
},
{
type: "doc",
label: "Vector operations",
id: "icicle/rust-bindings/vec-ops",
},
{
type: "doc",
label: "Multi GPU Support",
id: "icicle/rust-bindings/multi-gpu",
},
],
}
]
},
{
type: "category",
@@ -122,16 +59,16 @@ module.exports = {
label: "MSM",
id: "icicle/primitives/msm",
},
{
type: "doc",
label: "NTT",
id: "icicle/primitives/ntt",
},
{
type: "doc",
label: "Poseidon Hash",
id: "icicle/primitives/poseidon",
},
{
type: "doc",
label: "NTT",
id: "icicle/primitives/ntt",
}
],
},
{

View File

@@ -1,25 +0,0 @@
# Build script for the stand-alone Pedersen-commitment example: compiles example.cu into the
# `example` executable against the ICICLE headers and links NVML.
cmake_minimum_required(VERSION 3.18)
# Require C++17 / CUDA 17 for both host and device code (no fallback to older standards).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
# GPU architecture selection must happen before project() enables the CUDA language.
# On CMake < 3.24 the architecture is taken from the CUDA_ARCH variable
# (NOTE(review): CUDA_ARCH is not set in this file - presumably passed with -DCUDA_ARCH=...).
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)
# Allow constexpr host functions to be called from device code.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
# Release builds get no extra CUDA flags; debug builds get host (-g) and device (-G) debug info, unoptimized.
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
# Locate the NVML library (nvidia-ml); the extra search path is the CUDA toolkit's x86_64 Linux stubs directory.
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
# Enable relocatable device code for the example target.
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,33 +0,0 @@
# ICICLE example: Pedersen Commitment
## Best-Practices
We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save you time and mental energy.
## Key-Takeaway
A Pedersen Commitment is a cryptographic primitive to commit to a value or a vector of values while keeping it hidden, yet enabling the committer to reveal the value later. It provides both hiding (the commitment does not reveal any information about the value) and binding properties (once a value is committed, it cannot be changed without detection).
Pedersen commitments are based on Multi-Scalar Multiplication ([MSM](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md)).
`ICICLE` provides CUDA C++ support for [MSM](https://dev.ingonyama.com/icicle/primitives/msm).
An example of MSM is [here](../msm/README.md).
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
## Concise Explanation
We recommend this simple [explanation](https://www.rareskills.io/post/pedersen-commitment).
The original paper: T. P. Pedersen, "Non-Interactive and Information-Theoretic Secure Verifiable Secret Sharing," in Advances in Cryptology — CRYPTO 91, Lecture Notes in Computer Science, vol 576. Springer, Berlin, Heidelberg.
## What's in the example
1. Define the curve and the size of the commitment vector
2. Use a public random seed to transparently generate points on the elliptic curve whose discrete logarithms are unknown
3. Generate a (random) commitment vector and a salt (a.k.a. blinding factor)
4. Configure and execute MSM using on-host data
5. Output the commitment as an elliptic curve point

View File

@@ -1,9 +0,0 @@
#!/bin/bash
# Configures and builds the example from scratch into ./build.
# Exit immediately on error
set -e
# Always start from a clean build tree so stale CMake cache entries cannot leak into the build.
rm -rf build
mkdir -p build
# Configure (source dir = current directory, build dir = ./build), then compile.
cmake -S . -B build
cmake --build build

View File

@@ -1,159 +0,0 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <cassert>
#include <nvml.h>
#define CURVE_ID BN254
#include "appUtils/msm/msm.cu"
using namespace curve_config;
typedef point_field_t T;
// Modular exponentiation by square-and-multiply: returns base^exp in the field T.
// The exponent is consumed bit by bit from the least significant end
// (presumably T::div2 halves the integer value of its argument - confirm against the field class).
T modPow(T base, T exp) {
  T acc = T::one();
  T square = base;
  for (T remaining = exp; remaining != T::zero(); remaining = T::div2(remaining)) {
    // Current low bit set: fold the matching power of `base` into the result.
    if (T::is_odd(remaining)) {
      acc = acc * square;
    }
    // Move on to the next bit: base^(2^k) -> base^(2^(k+1)).
    square = square * square;
  }
  return acc;
}
// Euler's criterion: y2 is reported as a quadratic residue iff y2^((p-1)/2) == 1,
// where p - 1 is obtained as (0 - 1) in the field. Note that y2 == 0 yields false.
bool quadratic_residue(T y2) {
  const T p_minus_one = T::zero() - T::one();
  const T euler_exponent = T::div2(p_minus_one);
  const T legendre = modPow(y2, euler_exponent);
  return legendre == T::one();
}
// modular square root adapted from:
// https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp
//
// Tonelli-Shanks square root in the field T.
// On success stores a root of `a` in *result and returns true; returns false (leaving *result
// untouched) when `a` fails Euler's criterion, i.e. has no square root.
// NOTE(review): counters such as r and m are kept as field elements and compared with T::lt;
// this presumably relies on T::lt / T::div2 / T::is_even acting on the integer limb value - confirm.
bool mySQRT(T a, T *result) {
// Zero is its own square root.
if (a == T::zero()) {
*result = T::zero();
return true;
}
// Euler's criterion: a^((p-1)/2) must equal 1 for a root to exist.
if (modPow(a, T::div2(T::zero() - T::one())) != T::one() ) {
return false; // solution does not exist
}
// TODO: consider special cases
// if (p % 4 == 3) return power(a, (p + 1) / 4, p);
T s = T::zero() - T::one(); // p - 1,
T n = T::one() + T::one(); //2;
T r = T::zero();
T m;
// Factor p - 1 = s * 2^r with s odd.
while (T::is_even(s)) {
r = r + T::one();
s = T::div2(s); //s /= 2;
}
// find a non-square mod p
// (n is a non-residue once n^((p-1)/2) == p - 1, i.e. -1 in the field)
while (modPow(n, T::div2((T::zero() - T::one())) ) != T::zero() - T::one()) {
n = n + T::one();
}
// Initial guess x = a^((s+1)/2), with b = a^s and g = n^s.
T x = modPow(a, T::div2(s + T::one()));
T b = modPow(a, s);
T g = modPow(n, s);
// Each pass finds the smallest m (< r) with b^(2^m) == 1; m == 0 means b == 1 and x is the root.
// Otherwise x, b and g are corrected and r shrinks to m for the next pass.
for (;; r = m) {
T t = b;
for (m = T::zero(); T::lt(m,r) /* m < r*/ && t != T::one(); m = m + T::one()) t = t * t;
if (m == T::zero() ) {
*result = x;
return true;
}
// gs = g^(2^(r - m - 1)); the inner modPow computes the power of two used as exponent.
T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()) );
g = gs * gs ;
x = x * gs ;
b = b * g ;
}
}
// Finds a curve point whose x-coordinate is the first value >= the given x (stepping by one)
// for which x^3 + weierstrass_b is a quadratic residue, and writes it to *point.
// The curve equation used here has no linear term (y^2 = x^3 + b).
void point_near_x(T x, affine_t *point) {
  const T wb = T { weierstrass_b };

  // Walk x upwards until the right-hand side of the curve equation has a square root.
  T rhs = x*x*x + wb;
  while (!quadratic_residue(rhs)) {
    x = x + T::one();
    rhs = x*x*x + wb;
  }

  T root;
  // NOTE(review): `found` is never inspected; only the assert below guards the result,
  // and it is compiled out when NDEBUG is defined.
  bool found = mySQRT(rhs, &root);
  assert(root*root == rhs);

  point->x = x;
  point->y = root;
}
static int seed = 0;
// Returns a pseudo-random field value derived from the global `seed`, which is post-incremented
// on every call, so consecutive calls use consecutive seeds. A fresh std::mt19937_64 is built per call.
// Only limbs [0, T::TLC - 1) are filled; the top limb keeps whatever T's default construction
// gives it (NOTE(review): presumably this keeps the value below the modulus - confirm).
// No modular reduction is applied to the result.
static HOST_INLINE T rand_host_seed()
{
  std::mt19937_64 rng(seed++);
  std::uniform_int_distribution<unsigned> limb_dist;
  T sample;
  // TODO: use the full range of limbs, i.e. fill all T::TLC limbs and then reduce modulo the field modulus
  const unsigned filled_limbs = T::TLC - 1;
  for (unsigned limb = 0; limb < filled_limbs; limb++) {
    sample.limbs_storage.limbs[limb] = limb_dist(rng);
  }
  return sample;
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
// Pedersen commitment demo:
//   1. derive N+1 x-coordinates from a public seed,
//   2. turn each of them into a curve point (N generators + 1 generator for the salt),
//   3. draw N random scalars plus one random salt,
//   4. commit with a single MSM over the N+1 (scalar, point) pairs and print the result.
int main(int argc, char** argv)
{
  const unsigned N = 1u << 10;
  std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a blinding factor)" << std::endl;
  T* xs = new T[N+1];

  std::cout << "Generating random points transparently using publicly chosen seed" << std::endl;
  std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" << std::endl;
  seed = 1234;
  std::cout << "Using seed: " << seed << std::endl;
  std::cout << "Generating random field values" << std::endl;
  START_TIMER(gen);
  // Fill all N+1 entries: the last one seeds the generator point paired with the salt.
  // (Previously only N entries were filled, so points[N] was derived from an unseeded xs[N].)
  for (unsigned i = 0; i < N+1; i++) {
    xs[i] = rand_host_seed();
  }
  END_TIMER(gen, "Time to generate field values");
  std::cout << "xs[0]: " << xs[0] << std::endl;
  std::cout << "xs[1]: " << xs[1] << std::endl;

  // Heap allocation: N+1 affine points are too large to keep on the stack.
  affine_t* points = new affine_t[N+1];
  std::cout << "Generating point about random field values" << std::endl;
  START_TIMER(points);
  for (unsigned i = 0; i < N+1; i++) {
    point_near_x(xs[i], &points[i]);
  }
  END_TIMER(points, "Time to generate points");

  std::cout << "Generating commitment vector" << std::endl;
  projective_t result;
  scalar_t* scalars = new scalar_t[N+1];
  scalar_t::RandHostMany(scalars, N);

  std::cout << "Generating salt" << std::endl;
  scalars[N] = scalar_t::rand_host();

  std::cout << "Executing MSM" << std::endl;
  auto config = msm::DefaultMSMConfig<scalar_t>();
  START_TIMER(msm);
  // All inputs live on the host; the MSM covers the N committed values plus the salt.
  msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, N+1, config, &result);
  END_TIMER(msm, "Time to execute MSM");

  std::cout << "Computed commitment: " << result << std::endl;

  std::cout << "Cleaning up..." << std::endl;
  delete[] xs;
  delete[] scalars;
  delete[] points;
  return 0;
}

View File

@@ -1,2 +0,0 @@
#!/bin/bash
# Runs the example binary from ./build (the output directory used by compile.sh); build it first.
./build/example

View File

@@ -84,7 +84,7 @@ int main(int argc, char** argv)
// (4) multiply A,B
CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
vec_ops::VecOpsConfig<test_data> config{
vec_ops::VecOpsConfig<test_data> config {
ntt_config.ctx,
true, // is_a_on_device
true, // is_b_on_device
@@ -92,7 +92,8 @@ int main(int argc, char** argv)
false, // is_montgomery
false // is_async
};
CHK_IF_RETURN(vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));
CHK_IF_RETURN(
vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));
// (5) INTT (in place)
ntt_config.are_inputs_on_device = true;
@@ -117,7 +118,6 @@ int main(int argc, char** argv)
benchmark(false); // warmup
benchmark(true, 20);
ntt::ReleaseDomain<test_scalar>(ntt_config.ctx);
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
return 0;

View File

@@ -60,16 +60,10 @@ else()
endif()
project(icicle LANGUAGES CUDA CXX)
# Check CUDA version and, if possible, enable multi-threaded compilation
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
message(STATUS "Using multi-threaded CUDA compilation.")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --split-compile 0")
else()
message(STATUS "Can't use multi-threaded CUDA compilation.")
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
include_directories("${CMAKE_SOURCE_DIR}")
@@ -92,18 +86,10 @@ if (NOT IS_CURVE_SUPPORTED)
message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
endif ()
if (DEVMODE STREQUAL "ON")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O0 --ptxas-options=-O0 --ptxas-options=-allow-expensive-optimizations=false -DDEVMODE=ON")
endif ()
if (G2_DEFINED STREQUAL "ON")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2_DEFINED=ON")
endif ()
if (ECNTT_DEFINED STREQUAL "ON")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DECNTT_DEFINED=ON")
endif ()
option(BUILD_TESTS "Build tests" OFF)
if (NOT BUILD_TESTS)
@@ -118,9 +104,6 @@ if (NOT BUILD_TESTS)
if (NOT CURVE IN_LIST SUPPORTED_CURVES_WITHOUT_NTT)
list(APPEND ICICLE_SOURCES appUtils/ntt/ntt.cu)
list(APPEND ICICLE_SOURCES appUtils/ntt/kernel_ntt.cu)
if(ECNTT_DEFINED STREQUAL "ON")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DECNTT_DEFINED=ON")
endif()
endif()
add_library(

View File

@@ -1,2 +0,0 @@
test_keccak: test.cu keccak.cu
nvcc -o test_keccak -I. -I../.. test.cu

View File

@@ -1,275 +0,0 @@
#include "keccak.cuh"
namespace keccak {
// Rotate x left by y bits within 64 bits. y must be in [1, 63]: y == 0 would shift right by 64,
// which is undefined. Assumes x is an unsigned 64-bit value (all uses in this file pass uint64_t lanes).
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
// One theta mixing word: t = rotl(d0^d1^d2^d3^d4, 1) ^ (c0^c1^c2^c3^c4),
// i.e. the parity of the five d lanes rotated by one bit, XORed with the parity of the five c lanes.
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
{ \
t = ROTL64((d0 ^ d1 ^ d2 ^ d3 ^ d4), 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \
}
// Theta step over the 25 state lanes. Arguments are named sXY; lanes sharing the first digit X
// (sX0..sX4) form one group of five that receives the same mixing word tX.
// tX combines the parity of group X-1 with the rotated parity of group X+1 (indices mod 5).
// Uses the temporaries t0..t4, which must be declared by the caller.
#define THETA( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
s00 ^= t0; \
s01 ^= t0; \
s02 ^= t0; \
s03 ^= t0; \
s04 ^= t0; \
\
s10 ^= t1; \
s11 ^= t1; \
s12 ^= t1; \
s13 ^= t1; \
s14 ^= t1; \
\
s20 ^= t2; \
s21 ^= t2; \
s22 ^= t2; \
s23 ^= t2; \
s24 ^= t2; \
\
s30 ^= t3; \
s31 ^= t3; \
s32 ^= t3; \
s33 ^= t3; \
s34 ^= t3; \
\
s40 ^= t4; \
s41 ^= t4; \
s42 ^= t4; \
s43 ^= t4; \
s44 ^= t4; \
}
// Combined rho (per-lane rotation) and pi (lane permutation) steps, done in place as one chain:
// each assignment overwrites a lane with a rotated copy of the next lane in the chain.
// The first lane of the chain (s10, rotated by 1) is parked in t0 and written to s02 at the end.
// s00 is not touched. Uses the caller-declared temporary t0; statement order is essential.
#define RHOPI( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
t0 = ROTL64(s10, (uint64_t)1); \
s10 = ROTL64(s11, (uint64_t)44); \
s11 = ROTL64(s41, (uint64_t)20); \
s41 = ROTL64(s24, (uint64_t)61); \
s24 = ROTL64(s42, (uint64_t)39); \
s42 = ROTL64(s04, (uint64_t)18); \
s04 = ROTL64(s20, (uint64_t)62); \
s20 = ROTL64(s22, (uint64_t)43); \
s22 = ROTL64(s32, (uint64_t)25); \
s32 = ROTL64(s43, (uint64_t)8); \
s43 = ROTL64(s34, (uint64_t)56); \
s34 = ROTL64(s03, (uint64_t)41); \
s03 = ROTL64(s40, (uint64_t)27); \
s40 = ROTL64(s44, (uint64_t)14); \
s44 = ROTL64(s14, (uint64_t)2); \
s14 = ROTL64(s31, (uint64_t)55); \
s31 = ROTL64(s13, (uint64_t)45); \
s13 = ROTL64(s01, (uint64_t)36); \
s01 = ROTL64(s30, (uint64_t)28); \
s30 = ROTL64(s33, (uint64_t)21); \
s33 = ROTL64(s23, (uint64_t)15); \
s23 = ROTL64(s12, (uint64_t)10); \
s12 = ROTL64(s21, (uint64_t)6); \
s21 = ROTL64(s02, (uint64_t)3); \
s02 = t0; \
}
// Chi step (the non-linear layer). Lanes sharing the second digit Y (s0Y..s4Y) are processed as
// one group of five: each lane becomes lane ^ (~next & next-but-one), indices taken mod 5.
// All five new values of a group are computed into t0..t4 before any lane is overwritten, so
// every output depends only on the old values. Uses the caller-declared temporaries t0..t4.
#define KHI( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
t0 = s00 ^ (~s10 & s20); \
t1 = s10 ^ (~s20 & s30); \
t2 = s20 ^ (~s30 & s40); \
t3 = s30 ^ (~s40 & s00); \
t4 = s40 ^ (~s00 & s10); \
s00 = t0; \
s10 = t1; \
s20 = t2; \
s30 = t3; \
s40 = t4; \
\
t0 = s01 ^ (~s11 & s21); \
t1 = s11 ^ (~s21 & s31); \
t2 = s21 ^ (~s31 & s41); \
t3 = s31 ^ (~s41 & s01); \
t4 = s41 ^ (~s01 & s11); \
s01 = t0; \
s11 = t1; \
s21 = t2; \
s31 = t3; \
s41 = t4; \
\
t0 = s02 ^ (~s12 & s22); \
t1 = s12 ^ (~s22 & s32); \
t2 = s22 ^ (~s32 & s42); \
t3 = s32 ^ (~s42 & s02); \
t4 = s42 ^ (~s02 & s12); \
s02 = t0; \
s12 = t1; \
s22 = t2; \
s32 = t3; \
s42 = t4; \
\
t0 = s03 ^ (~s13 & s23); \
t1 = s13 ^ (~s23 & s33); \
t2 = s23 ^ (~s33 & s43); \
t3 = s33 ^ (~s43 & s03); \
t4 = s43 ^ (~s03 & s13); \
s03 = t0; \
s13 = t1; \
s23 = t2; \
s33 = t3; \
s43 = t4; \
\
t0 = s04 ^ (~s14 & s24); \
t1 = s14 ^ (~s24 & s34); \
t2 = s24 ^ (~s34 & s44); \
t3 = s34 ^ (~s44 & s04); \
t4 = s44 ^ (~s04 & s14); \
s04 = t0; \
s14 = t1; \
s24 = t2; \
s34 = t3; \
s44 = t4; \
}
// Iota step: XOR the round constant rc into a single lane (keccakf applies it to s[0]).
#define IOTA(element, rc) \
{ \
element ^= rc; \
}
// Round constants for the 24 rounds of the permutation; RC[i] is XORed into lane 0 at the end of round i.
__device__ const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
// Applies the 24-round permutation in place to the 25-lane (25 x 64-bit = 1600-bit) state `s`.
// Each round runs THETA, RHOPI, KHI and IOTA in that order.
// The macros take lanes in the order s[0], s[5], s[10], s[15], s[20], s[1], ..., so macro
// argument sXY refers to s[X + 5*Y].
__device__ void keccakf(uint64_t s[25])
{
// Scratch words shared by the step macros (they reference t0..t4 by name).
uint64_t t0, t1, t2, t3, t4;
for (int i = 0; i < 24; i++) {
THETA(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
RHOPI(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
KHI(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
// Round constant goes into lane 0 only.
IOTA(s[0], RC[i]);
}
}
/**
 * Kernel that hashes `number_of_blocks` independent messages; each thread hashes one message.
 *
 * @tparam C capacity in bits (rate = 1600 - C). Must be a multiple of 64.
 * @tparam D digest size in bits. Must be a multiple of 64.
 * @param input device pointer to the messages, stored back to back, `input_block_size` bytes each.
 *              Messages are read as 64-bit words, so `input` must be 8-byte aligned and
 *              `input_block_size` must be a multiple of 8.
 * @param input_block_size size of one message in bytes.
 * @param number_of_blocks number of messages (and digests).
 * @param output device pointer receiving the digests, D / 8 bytes each, stored back to back.
 *
 * The message is padded with a 0x01 byte, zeroes, and a final byte with bit 0x80 set
 * (legacy Keccak padding; SHA-3 would use a 0x06 suffix byte instead).
 */
template <int C, int D>
__global__ void keccak_hash_blocks(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output)
{
  static_assert(C % 64 == 0 && D % 64 == 0, "keccak_hash_blocks: C and D must be multiples of 64");

  int bid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (bid >= number_of_blocks) { return; }

  const int r_bits = 1600 - C;
  const int r_bytes = r_bits / 8;
  const int r_lanes = r_bytes / 8;
  const int d_bytes = D / 8;

  // Offsets are computed in size_t: bid * input_block_size overflows int for large batches.
  uint8_t* b_input = input + (size_t)bid * input_block_size;
  uint8_t* b_output = output + (size_t)bid * d_bytes;
  uint64_t state[25] = {}; // Initialize with zeroes

  int input_len = input_block_size;

  // absorb every full rate-sized chunk of the message
  while (input_len >= r_bytes) {
    for (int lane = 0; lane < r_lanes; lane++) {
      state[lane] ^= *(uint64_t*)(b_input + lane * 8);
    }
    keccakf(state);
    b_input += r_bytes;
    input_len -= r_bytes;
  }

  // Last (possibly empty) chunk plus padding. The buffer is declared as 64-bit lanes so that the
  // lane-wise reads below are naturally aligned; a plain uint8_t array carries no such guarantee
  // and a misaligned 64-bit access is an error on the device.
  uint64_t last_block_lanes[r_lanes] = {};
  uint8_t* last_block = (uint8_t*)last_block_lanes;
  for (int i = 0; i < input_len; i++) {
    last_block[i] = b_input[i];
  }

  // pad 10*1: first pad byte right after the message, remaining bytes are already zero
  last_block[input_len] = 1;
  // last bit (ORed in, because it may share the byte with the first pad bit)
  last_block[r_bytes - 1] |= 0x80;

  for (int lane = 0; lane < r_lanes; lane++) {
    state[lane] ^= last_block_lanes[lane];
  }
  keccakf(state);

  // squeeze: the digest is the first D bits of the state
#pragma unroll
  for (int i = 0; i < d_bytes; i += 8) {
    *(uint64_t*)(b_output + i) = state[i / 8];
  }
}
/**
 * Host entry point: hashes `number_of_blocks` messages of `input_block_size` bytes each on the GPU.
 *
 * Depending on `config`, inputs are copied to the device and outputs copied back to the host;
 * temporary device buffers are allocated and released on `config.ctx.stream`.
 *
 * @tparam C capacity in bits. @tparam D digest size in bits.
 * @param input messages, on host or device according to `config.are_inputs_on_device`.
 * @param input_block_size size of one message in bytes.
 * @param number_of_blocks number of messages.
 * @param output digests (D / 8 bytes each), on host or device according to `config.are_outputs_on_device`.
 * @param config device context, data location flags and sync/async selection.
 * @return the status produced by the CUDA calls / error-check macros. When `config.is_async` is false
 *         the stream is synchronized before returning.
 */
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
{
  CHK_INIT_IF_RETURN();
  cudaStream_t& stream = config.ctx.stream;

  // Byte counts are computed in size_t; the int product overflows once the batch exceeds 2 GiB.
  const size_t input_bytes = (size_t)number_of_blocks * input_block_size;
  const size_t output_bytes = (size_t)number_of_blocks * (D / 8);

  uint8_t* input_device;
  if (config.are_inputs_on_device) {
    input_device = input;
  } else {
    CHK_IF_RETURN(cudaMallocAsync(&input_device, input_bytes, stream));
    CHK_IF_RETURN(cudaMemcpyAsync(input_device, input, input_bytes, cudaMemcpyHostToDevice, stream));
  }

  uint8_t* output_device;
  if (config.are_outputs_on_device) {
    output_device = output;
  } else {
    CHK_IF_RETURN(cudaMallocAsync(&output_device, output_bytes, stream));
  }

  // One thread per message, 1024 threads per GPU block.
  int number_of_threads = 1024;
  int number_of_gpu_blocks = (number_of_blocks - 1) / number_of_threads + 1;
  keccak_hash_blocks<C, D><<<number_of_gpu_blocks, number_of_threads, 0, stream>>>(
    input_device, input_block_size, number_of_blocks, output_device);

  // Stream-ordered frees: they take effect after the kernel queued above on the same stream.
  if (!config.are_inputs_on_device) CHK_IF_RETURN(cudaFreeAsync(input_device, stream));

  if (!config.are_outputs_on_device) {
    CHK_IF_RETURN(cudaMemcpyAsync(output, output_device, output_bytes, cudaMemcpyDeviceToHost, stream));
    CHK_IF_RETURN(cudaFreeAsync(output_device, stream));
  }

  if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
  return CHK_LAST();
}
// C-linkage entry point: keccak_hash with capacity C = 512 bits (rate 1088 bits = 136 bytes)
// and a 256-bit (32-byte) digest per input block.
extern "C" cudaError_t
Keccak256(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
{
return keccak_hash<512, 256>(input, input_block_size, number_of_blocks, output, config);
}
// C-linkage entry point: keccak_hash with capacity C = 1024 bits (rate 576 bits = 72 bytes)
// and a 512-bit (64-byte) digest per input block.
extern "C" cudaError_t
Keccak512(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
{
return keccak_hash<1024, 512>(input, input_block_size, number_of_blocks, output, config);
}
} // namespace keccak

View File

@@ -1,56 +0,0 @@
#pragma once
#ifndef KECCAK_H
#define KECCAK_H
#include <cstdint>
#include "utils/device_context.cuh"
#include "utils/error_handler.cuh"
namespace keccak {
/**
* @struct KeccakConfig
* Struct that encodes various Keccak parameters.
* The struct is brace-initialized positionally by default_keccak_config(), so the order of the
* fields is part of its contract.
*/
struct KeccakConfig {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
bool is_async; /**< Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash
* function will block the current CPU thread. Default value: false. */
};
/**
 * Returns a KeccakConfig bound to the default device context, with inputs and outputs on the
 * host and synchronous execution.
 *
 * Declared `inline` because the definition lives in a header: without it, every translation
 * unit including this header would emit its own external definition and linking more than one
 * of them would fail with a multiple-definition error.
 */
inline KeccakConfig default_keccak_config()
{
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  KeccakConfig config = {
    ctx,   // ctx
    false, // are_inputs_on_device
    false, // are_outputs_on_device
    false, // is_async
  };
  return config;
}
/**
* Compute the keccak hash over a sequence of preimages.
* Takes {number_of_blocks * input_block_size} bytes of input and computes {number_of_blocks} outputs, each of size
* {D / 8} bytes (i.e. {D / 64} u64).
* @tparam C - number of bits of capacity (c = b - r = 1600 - r). Only multiples of 64 are supported.
* @tparam D - number of bits of output. Only multiples of 64 are supported.
* @param input a pointer to the input data. May be allocated on device or on host, regulated
* by the config. Must be of size {input_block_size * number_of_blocks} bytes.
* @param input_block_size - size of each input block in bytes. Should be divisible by 8.
* @param number_of_blocks number of input and output blocks. One GPU thread processes one block
* @param output a pointer to the output data. May be allocated on device or on host, regulated
* by the config. Must be of size {(D / 8) * number_of_blocks} bytes.
* @param config device context, input/output location flags and sync/async selection (see KeccakConfig).
* @return the resulting cudaError_t status.
*/
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config);
} // namespace keccak
#endif

View File

@@ -1,67 +0,0 @@
#include "utils/device_context.cuh"
#include "keccak.cu"
// #define DEBUG
#ifndef __CUDA_ARCH__
#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <iomanip>
using namespace keccak;
#define D 256
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
// Writes `size` bytes from `values` to std::cout as lowercase hex, two digits per byte,
// followed by a newline (and a flush). The formatting state of std::cout is left untouched.
void uint8ToHexString(const uint8_t* values, int size)
{
  static const char kHexDigits[] = "0123456789abcdef";

  std::string hexString;
  for (int idx = 0; idx < size; ++idx) {
    const uint8_t byte = values[idx];
    hexString.push_back(kHexDigits[byte >> 4]);   // high nibble
    hexString.push_back(kHexDigits[byte & 0x0f]); // low nibble
  }
  std::cout << hexString << std::endl;
}
int main(int argc, char* argv[])
{
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
START_TIMER(allocation_timer);
// Prepare input data of [0, 1, 2 ... (number_of_blocks * input_block_size) - 1]
int number_of_blocks = argc > 1 ? 1 << atoi(argv[1]) : 1024;
int input_block_size = argc > 2 ? atoi(argv[2]) : 136;
uint8_t* in_ptr = static_cast<uint8_t*>(malloc(number_of_blocks * input_block_size));
for (uint64_t i = 0; i < number_of_blocks * input_block_size; i++) {
in_ptr[i] = (uint8_t)i;
}
END_TIMER(allocation_timer, "Allocate mem and fill input");
uint8_t* out_ptr = static_cast<uint8_t*>(malloc(number_of_blocks * (D / 8)));
START_TIMER(keccak_timer);
KeccakConfig config = default_keccak_config();
Keccak256(in_ptr, input_block_size, number_of_blocks, out_ptr, config);
END_TIMER(keccak_timer, "Keccak")
for (int i = 0; i < number_of_blocks; i++) {
#ifdef DEBUG
uint8ToHexString(out_ptr + i * (D / 8), D / 8);
#endif
}
free(in_ptr);
free(out_ptr);
}
#endif

View File

@@ -1,4 +1,4 @@
test_msm:
mkdir -p work
nvcc -o work/test_msm -std=c++17 -I. -I../.. tests/msm_test.cu
work/test_msm
work/test_msm

View File

@@ -25,20 +25,10 @@ namespace msm {
#define MAX_TH 256
// #define SIGNED_DIG //WIP
// #define BIG_TRIANGLE
// #define SSM_SUM //WIP
// For each of the first `count` affine points, writes that point doubled `shift` times
// (i.e. multiplied by 2^shift) to points_out at the same index. One thread per point;
// the doubling is done in the projective type P and converted back to affine type A.
template <typename A, typename P>
__global__ void left_shift_kernel(A* points, const unsigned shift, const unsigned count, A* points_out)
{
  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < count) {
    P acc = P::from_affine(points[idx]);
    for (unsigned remaining = shift; remaining > 0; --remaining) {
      acc = P::dbl(acc);
    }
    points_out[idx] = P::to_affine(acc);
  }
}
unsigned get_optimal_c(int bitsize) { return (unsigned)max(ceil(std::log2(bitsize)) - 4.0, 1.0); }
// Heuristic bucket width c for an MSM of the given size: ceil(log2(bitsize)) - 4, clamped to at least 1.
// The subtraction is done in floating point: in unsigned arithmetic it wraps around for
// bitsize < 16 (e.g. 3u - 4 == 4294967295), which defeats the clamp and returns a huge value.
// Non-positive inputs (log2 gives -inf or NaN) also fall through to the minimum of 1.
unsigned get_optimal_c(int bitsize)
{
  const double c = ceil(log2((double)bitsize)) - 4.0;
  return c > 1.0 ? (unsigned)c : 1U;
}
template <typename E>
__global__ void normalize_kernel(E* inout, E factor, int n)
@@ -158,38 +148,47 @@ namespace msm {
__global__ void split_scalars_kernel(
unsigned* buckets_indices,
unsigned* point_indices,
const S* scalars,
S* scalars,
unsigned nof_scalars,
unsigned points_size,
unsigned msm_size,
unsigned nof_bms,
unsigned bm_bitsize,
unsigned c,
unsigned precomputed_bms_stride)
unsigned c)
{
// constexpr unsigned sign_mask = 0x80000000;
// constexpr unsigned trash_bucket = 0x80000000;
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_scalars) return;
unsigned bucket_index;
// unsigned bucket_index2;
unsigned current_index;
unsigned msm_index = tid / msm_size;
const S& scalar = scalars[tid];
// unsigned borrow = 0;
S& scalar = scalars[tid];
for (unsigned bm = 0; bm < nof_bms; bm++) {
const unsigned precomputed_index = bm / precomputed_bms_stride;
const unsigned target_bm = bm % precomputed_bms_stride;
bucket_index = scalar.get_scalar_digit(bm, c);
current_index = bm * nof_scalars + tid;
if (bucket_index != 0) {
buckets_indices[current_index] =
(msm_index << (c + bm_bitsize)) | (target_bm << c) |
bucket_index; // the bucket module number and the msm number are appended at the msbs
} else {
buckets_indices[current_index] = 0; // will be skipped
#ifdef SIGNED_DIG
bucket_index += borrow;
borrow = 0;
unsigned sign = 0;
if (bucket_index > (1 << (c - 1))) {
bucket_index = (1 << c) - bucket_index;
borrow = 1;
sign = sign_mask;
}
point_indices[current_index] =
tid % points_size + points_size * precomputed_index; // the point index is saved for later
#endif
current_index = bm * nof_scalars + tid;
#ifdef SIGNED_DIG
point_indices[current_index] = sign | tid; // the point index is saved for later
#else
buckets_indices[current_index] =
(msm_index << (c + bm_bitsize)) | (bm << c) |
bucket_index; // the bucket module number and the msm number are appended at the msbs
if (bucket_index == 0) buckets_indices[current_index] = 0; // will be skipped
point_indices[current_index] = tid % points_size; // the point index is saved for later
#endif
}
}
@@ -224,11 +223,19 @@ namespace msm {
const unsigned msm_idx_shift,
const unsigned c)
{
// constexpr unsigned sign_mask = 0x80000000;
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_buckets_to_compute) return;
#ifdef SIGNED_DIG // todo - fix
const unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
const unsigned bm_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1)) >> c;
const unsigned bucket_index =
msm_index * nof_buckets + bm_index * ((1 << (c - 1)) + 1) + (single_bucket_indices[tid] & ((1 << c) - 1));
#else
unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
const unsigned single_bucket_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1));
unsigned bucket_index = msm_index * nof_buckets + single_bucket_index;
#endif
const unsigned bucket_offset = bucket_offsets[tid];
const unsigned bucket_size = bucket_sizes[tid];
@@ -236,7 +243,14 @@ namespace msm {
for (unsigned i = 0; i < bucket_size;
i++) { // add the relevant points starting from the relevant offset up to the bucket size
unsigned point_ind = point_indices[bucket_offset + i];
#ifdef SIGNED_DIG
unsigned sign = point_ind & sign_mask;
point_ind &= ~sign_mask;
A point = points[point_ind];
if (sign) point = A::neg(point);
#else
A point = points[point_ind];
#endif
bucket =
i ? (point == A::zero() ? bucket : bucket + point) : (point == A::zero() ? P::zero() : P::from_affine(point));
}
@@ -303,7 +317,11 @@ namespace msm {
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_bms) return;
#ifdef SIGNED_DIG
unsigned buckets_in_bm = (1 << c) + 1;
#else
unsigned buckets_in_bm = (1 << c);
#endif
P line_sum = buckets[(tid + 1) * buckets_in_bm - 1];
final_sums[tid] = line_sum;
for (unsigned i = buckets_in_bm - 2; i > 0; i--) {
@@ -360,8 +378,8 @@ namespace msm {
cudaError_t bucket_method_msm(
unsigned bitsize,
unsigned c,
const S* scalars,
const A* points,
S* scalars,
A* points,
unsigned batch_size, // number of MSMs to compute
unsigned single_msm_size, // number of elements per MSM (a.k.a N)
unsigned nof_points, // number of EC points in 'points' array. Must be either (1) single_msm_size if MSMs are
@@ -374,7 +392,6 @@ namespace msm {
bool are_results_on_device,
bool is_big_triangle,
int large_bucket_factor,
int precompute_factor,
bool is_async,
cudaStream_t stream)
{
@@ -386,59 +403,44 @@ namespace msm {
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument, "bucket_method_msm: #points must be divisible by single_msm_size*batch_size");
}
if ((precompute_factor & (precompute_factor - 1)) != 0) {
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument,
"bucket_method_msm: precompute factors that are not powers of 2 currently unsupported");
}
const S* d_scalars;
S* d_allocated_scalars = nullptr;
S* d_scalars;
if (!are_scalars_on_device) {
// copy scalars to gpu
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_scalars, sizeof(S) * nof_scalars, stream));
CHK_IF_RETURN(
cudaMemcpyAsync(d_allocated_scalars, scalars, sizeof(S) * nof_scalars, cudaMemcpyHostToDevice, stream));
if (are_scalars_montgomery_form) {
CHK_IF_RETURN(mont::FromMontgomery(d_allocated_scalars, nof_scalars, stream, d_allocated_scalars));
}
d_scalars = d_allocated_scalars;
} else { // already on device
if (are_scalars_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_scalars, sizeof(S) * nof_scalars, stream));
CHK_IF_RETURN(mont::FromMontgomery(scalars, nof_scalars, stream, d_allocated_scalars));
d_scalars = d_allocated_scalars;
} else {
d_scalars = scalars;
}
CHK_IF_RETURN(cudaMallocAsync(&d_scalars, sizeof(S) * nof_scalars, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * nof_scalars, cudaMemcpyHostToDevice, stream));
} else {
d_scalars = scalars;
}
unsigned total_bms_per_msm = (bitsize + c - 1) / c;
unsigned nof_bms_per_msm = (total_bms_per_msm - 1) / precompute_factor + 1;
unsigned input_indexes_count = nof_scalars * total_bms_per_msm;
unsigned bm_bitsize = (unsigned)ceil(std::log2(nof_bms_per_msm));
if (are_scalars_montgomery_form) {
if (are_scalars_on_device) {
S* d_mont_scalars;
CHK_IF_RETURN(cudaMallocAsync(&d_mont_scalars, sizeof(S) * nof_scalars, stream));
CHK_IF_RETURN(mont::FromMontgomery(d_scalars, nof_scalars, stream, d_mont_scalars));
d_scalars = d_mont_scalars;
} else
CHK_IF_RETURN(mont::FromMontgomery(d_scalars, nof_scalars, stream, d_scalars));
}
unsigned nof_bms_per_msm = (bitsize + c - 1) / c;
unsigned* bucket_indices;
unsigned* point_indices;
unsigned* sorted_bucket_indices;
unsigned* sorted_point_indices;
CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * input_indexes_count, stream));
CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * input_indexes_count, stream));
CHK_IF_RETURN(cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * input_indexes_count, stream));
CHK_IF_RETURN(cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * input_indexes_count, stream));
CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
CHK_IF_RETURN(cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
CHK_IF_RETURN(cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms_per_msm));
// split scalars into digits
unsigned NUM_THREADS = 1 << 10;
unsigned NUM_BLOCKS = (nof_scalars + NUM_THREADS - 1) / NUM_THREADS;
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
bucket_indices, point_indices, d_scalars, nof_scalars, nof_points, single_msm_size, total_bms_per_msm,
bm_bitsize, c, nof_bms_per_msm);
nof_points *= precompute_factor;
bucket_indices, point_indices, d_scalars, nof_scalars, nof_points, single_msm_size, nof_bms_per_msm, bm_bitsize,
c);
// ------------------------------ Sorting routines for scalars start here ----------------------------------
// sort indices - the indices are sorted from smallest to largest in order to group together the points that
// belong to each bucket
unsigned* sort_indices_temp_storage{};
@@ -448,22 +450,26 @@ namespace msm {
// more info
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices,
point_indices, sorted_point_indices, input_indexes_count, 0, sizeof(unsigned) * 8, stream));
point_indices, sorted_point_indices, nof_scalars * nof_bms_per_msm, 0, sizeof(unsigned) * 8, stream));
CHK_IF_RETURN(cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream));
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for
// more info
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices,
point_indices, sorted_point_indices, input_indexes_count, 0, sizeof(unsigned) * 8, stream));
point_indices, sorted_point_indices, nof_scalars * nof_bms_per_msm, 0, sizeof(unsigned) * 8, stream));
CHK_IF_RETURN(cudaFreeAsync(sort_indices_temp_storage, stream));
CHK_IF_RETURN(cudaFreeAsync(bucket_indices, stream));
CHK_IF_RETURN(cudaFreeAsync(point_indices, stream));
// compute number of bucket modules and number of buckets in each module
unsigned nof_bms_in_batch = nof_bms_per_msm * batch_size;
#ifdef SIGNED_DIG
const unsigned nof_buckets = nof_bms_per_msm * ((1 << (c - 1)) + 1); // signed digits
#else
// minus nof_bms_per_msm because zero bucket is not included in each bucket module
const unsigned nof_buckets = (nof_bms_per_msm << c) - nof_bms_per_msm;
#endif
const unsigned total_nof_buckets = nof_buckets * batch_size;
// find bucket_sizes
@@ -478,11 +484,11 @@ namespace msm {
size_t encode_temp_storage_bytes = 0;
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, input_indexes_count, stream));
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
CHK_IF_RETURN(cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream));
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, input_indexes_count, stream));
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
CHK_IF_RETURN(cudaFreeAsync(encode_temp_storage, stream));
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_indices, stream));
@@ -498,33 +504,28 @@ namespace msm {
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets + 1, stream));
CHK_IF_RETURN(cudaFreeAsync(offsets_temp_storage, stream));
// ----------- Starting to upload points (if they were on host) in parallel to scalar sorting ----------------
const A* d_points;
A* d_allocated_points = nullptr;
cudaStream_t stream_points = nullptr;
A* d_points;
cudaStream_t stream_points;
if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaStreamCreate(&stream_points));
if (!are_points_on_device) {
// copy points to gpu
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_points, sizeof(A) * nof_points, stream_points));
CHK_IF_RETURN(
cudaMemcpyAsync(d_allocated_points, points, sizeof(A) * nof_points, cudaMemcpyHostToDevice, stream_points));
if (are_points_montgomery_form) {
CHK_IF_RETURN(mont::FromMontgomery(d_allocated_points, nof_points, stream_points, d_allocated_points));
}
d_points = d_allocated_points;
} else { // already on device
if (are_points_montgomery_form) {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_points, sizeof(A) * nof_points, stream_points));
CHK_IF_RETURN(mont::FromMontgomery(points, nof_points, stream_points, d_allocated_points));
d_points = d_allocated_points;
} else {
d_points = points;
}
CHK_IF_RETURN(cudaMallocAsync(&d_points, sizeof(A) * nof_points, stream_points));
CHK_IF_RETURN(cudaMemcpyAsync(d_points, points, sizeof(A) * nof_points, cudaMemcpyHostToDevice, stream_points));
} else {
d_points = points;
}
if (are_points_montgomery_form) {
if (are_points_on_device) {
A* d_mont_points;
CHK_IF_RETURN(cudaMallocAsync(&d_mont_points, sizeof(A) * nof_points, stream_points));
CHK_IF_RETURN(mont::FromMontgomery(d_points, nof_points, stream_points, d_mont_points));
d_points = d_mont_points;
} else
CHK_IF_RETURN(mont::FromMontgomery(d_points, nof_points, stream_points, d_points));
}
cudaEvent_t event_points_uploaded;
if (stream_points) {
if (!are_points_on_device || are_points_montgomery_form) {
CHK_IF_RETURN(cudaEventCreateWithFlags(&event_points_uploaded, cudaEventDisableTiming));
CHK_IF_RETURN(cudaEventRecord(event_points_uploaded, stream_points));
}
@@ -608,7 +609,7 @@ namespace msm {
cudaMemcpyAsync(&h_nof_large_buckets, nof_large_buckets, sizeof(unsigned), cudaMemcpyDeviceToHost, stream));
CHK_IF_RETURN(cudaFreeAsync(nof_large_buckets, stream));
if (stream_points) {
if (!are_points_on_device || are_points_montgomery_form) {
// by this point, points need to be already uploaded and un-Montgomeried
CHK_IF_RETURN(cudaStreamWaitEvent(stream, event_points_uploaded));
CHK_IF_RETURN(cudaEventDestroy(event_points_uploaded));
@@ -617,7 +618,7 @@ namespace msm {
cudaStream_t stream_large_buckets;
cudaEvent_t event_large_buckets_accumulated;
// ---------------- This is where handling of large buckets happens (if there are any) -------------
// this is where handling of large buckets happens (if there are any)
if (h_nof_large_buckets > 0 && bucket_th > 0) {
CHK_IF_RETURN(cudaStreamCreate(&stream_large_buckets));
CHK_IF_RETURN(cudaEventCreateWithFlags(&event_large_buckets_accumulated, cudaEventDisableTiming));
@@ -650,10 +651,10 @@ namespace msm {
// buckets
unsigned large_buckets_nof_threads =
(h_nof_pts_in_large_buckets + average_bucket_size - 1) / average_bucket_size + h_nof_large_buckets;
unsigned log_nof_large_buckets = (unsigned)ceil(std::log2(h_nof_large_buckets));
unsigned log_nof_large_buckets = (unsigned)ceil(log2(h_nof_large_buckets));
unsigned* large_bucket_indices;
CHK_IF_RETURN(cudaMallocAsync(&large_bucket_indices, sizeof(unsigned) * large_buckets_nof_threads, stream));
NUM_THREADS = max(1, min(1 << 8, h_nof_large_buckets));
NUM_THREADS = min(1 << 8, h_nof_large_buckets);
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
initialize_large_bucket_indices<P><<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
sorted_bucket_sizes_sum, average_bucket_size, h_nof_large_buckets, log_nof_large_buckets,
@@ -662,24 +663,24 @@ namespace msm {
P* large_buckets;
CHK_IF_RETURN(cudaMallocAsync(&large_buckets, sizeof(P) * large_buckets_nof_threads, stream_large_buckets));
NUM_THREADS = max(1, min(1 << 8, large_buckets_nof_threads));
NUM_THREADS = min(1 << 8, large_buckets_nof_threads);
NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS;
accumulate_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
large_buckets, sorted_bucket_offsets, sorted_bucket_sizes, large_bucket_indices, sorted_point_indices,
d_points, h_nof_large_buckets, c, average_bucket_size, log_nof_large_buckets, large_buckets_nof_threads);
NUM_THREADS = max(1, min(MAX_TH, h_nof_large_buckets));
NUM_THREADS = min(MAX_TH, h_nof_large_buckets);
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
// normalization is needed to update buckets sizes and offsets due to reduction that already took place
normalize_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
sorted_bucket_sizes_sum, average_bucket_size, h_nof_large_buckets);
// reduce
for (int s = h_largest_bucket; s > 1; s = ((s + 1) >> 1)) {
NUM_THREADS = max(1, min(MAX_TH, h_nof_large_buckets));
NUM_THREADS = min(MAX_TH, h_nof_large_buckets);
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
normalize_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
sorted_bucket_sizes, s == h_largest_bucket ? average_bucket_size : 2, h_nof_large_buckets);
NUM_THREADS = max(1, min(MAX_TH, large_buckets_nof_threads));
NUM_THREADS = min(MAX_TH, large_buckets_nof_threads);
NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS;
sum_reduction_variable_size_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
large_buckets, sorted_bucket_sizes_sum, sorted_bucket_sizes, large_bucket_indices,
@@ -688,7 +689,7 @@ namespace msm {
CHK_IF_RETURN(cudaFreeAsync(large_bucket_indices, stream_large_buckets));
// distribute
NUM_THREADS = max(1, min(MAX_TH, h_nof_large_buckets));
NUM_THREADS = min(MAX_TH, h_nof_large_buckets);
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
distribute_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
large_buckets, buckets, sorted_bucket_sizes_sum, sorted_single_bucket_indices, h_nof_large_buckets,
@@ -699,11 +700,10 @@ namespace msm {
CHK_IF_RETURN(cudaEventRecord(event_large_buckets_accumulated, stream_large_buckets));
}
// ------------------------- Accumulation of (non-large) buckets ---------------------------------
// launch the accumulation kernel with maximum threads
if (h_nof_buckets_to_compute > h_nof_large_buckets) {
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (h_nof_buckets_to_compute - h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
// launch the accumulation kernel with maximum threads
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
buckets, sorted_bucket_offsets + h_nof_large_buckets, sorted_bucket_sizes + h_nof_large_buckets,
sorted_single_bucket_indices + h_nof_large_buckets, sorted_point_indices, d_points,
@@ -719,11 +719,24 @@ namespace msm {
CHK_IF_RETURN(cudaStreamDestroy(stream_large_buckets));
}
P* d_allocated_final_result = nullptr;
if (!are_results_on_device)
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_final_result, sizeof(P) * batch_size, stream));
#ifdef SSM_SUM
// sum each bucket
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
ssm_buckets_kernel<fake_point, fake_scalar>
<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, single_bucket_indices, nof_buckets, c);
// sum each bucket module
P* final_results;
CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_per_msm, stream));
NUM_THREADS = 1 << c;
NUM_BLOCKS = nof_bms_per_msm;
sum_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results);
#endif
P* d_final_result;
if (!are_results_on_device) CHK_IF_RETURN(cudaMallocAsync(&d_final_result, sizeof(P) * batch_size, stream));
// --- Reduction of buckets happens here, after this we'll get a single sum for each bucket module/window ---
unsigned nof_empty_bms_per_batch = 0; // for non-triangle accumulation this may be >0
P* final_results;
if (is_big_triangle || c == 1) {
@@ -731,9 +744,15 @@ namespace msm {
// launch the bucket module sum kernel - a thread for each bucket module
NUM_THREADS = 32;
NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
#ifdef SIGNED_DIG
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
buckets, final_results, nof_bms_in_batch, c - 1); // signed digits
#else
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms_in_batch, c);
#endif
} else {
unsigned source_bits_count = c;
// bool odd_source_c = source_bits_count % 2;
unsigned source_windows_count = nof_bms_per_msm;
unsigned source_buckets_count = nof_buckets + nof_bms_per_msm;
unsigned target_windows_count = 0;
@@ -759,7 +778,7 @@ namespace msm {
const bool is_last_iter = (j == target_bits_count - 1);
unsigned nof_threads =
(((target_buckets_count - target_windows_count) >> 1) << (target_bits_count - 1 - j)) * batch_size;
NUM_THREADS = max(1, min(MAX_TH, nof_threads));
NUM_THREADS = min(MAX_TH, nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
is_first_iter ? source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1,
@@ -773,11 +792,12 @@ namespace msm {
}
}
if (target_bits_count == 1) {
// Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guaranteed
// to be empty when target_windows_count>bitsize. for example consider bitsize=253 and c=2. The reduction
// ends with 254 bms but the most significant one is guaranteed to be zero since the scalars are 253b.
// Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guaranteed to
// be empty when target_windows_count>bitsize.
// for example consider bitsize=253 and c=2. The reduction ends with 254 bms but the most significant one is
// guaranteed to be zero since the scalars are 253b.
nof_bms_per_msm = target_windows_count;
nof_empty_bms_per_batch = target_windows_count > bitsize ? target_windows_count - bitsize : 0;
nof_empty_bms_per_batch = target_windows_count - bitsize;
nof_bms_in_batch = nof_bms_per_msm * batch_size;
CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream));
@@ -799,29 +819,28 @@ namespace msm {
temp_buckets1 = nullptr;
temp_buckets2 = nullptr;
source_bits_count = target_bits_count;
// odd_source_c = source_bits_count % 2;
source_windows_count = target_windows_count;
source_buckets_count = target_buckets_count;
}
}
// ------- This is the final stage where bucket modules/window sums get added up with appropriate weights
// -------
// launch the double and add kernel, a single thread per batch element
NUM_THREADS = 32;
NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
// launch the double and add kernel, a single thread per batch element
final_accumulation_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size, nof_bms_per_msm,
final_results, are_results_on_device ? final_result : d_final_result, batch_size, nof_bms_per_msm,
nof_empty_bms_per_batch, c);
CHK_IF_RETURN(cudaFreeAsync(final_results, stream));
if (!are_results_on_device)
CHK_IF_RETURN(cudaMemcpyAsync(
final_result, d_allocated_final_result, sizeof(P) * batch_size, cudaMemcpyDeviceToHost, stream));
CHK_IF_RETURN(
cudaMemcpyAsync(final_result, d_final_result, sizeof(P) * batch_size, cudaMemcpyDeviceToHost, stream));
// free memory
if (d_allocated_scalars) CHK_IF_RETURN(cudaFreeAsync(d_allocated_scalars, stream));
if (d_allocated_points) CHK_IF_RETURN(cudaFreeAsync(d_allocated_points, stream));
if (d_allocated_final_result) CHK_IF_RETURN(cudaFreeAsync(d_allocated_final_result, stream));
if (!are_scalars_on_device || are_scalars_montgomery_form) CHK_IF_RETURN(cudaFreeAsync(d_scalars, stream));
if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaFreeAsync(d_points, stream));
if (!are_results_on_device) CHK_IF_RETURN(cudaFreeAsync(d_final_result, stream));
CHK_IF_RETURN(cudaFreeAsync(buckets, stream));
if (!is_async) CHK_IF_RETURN(cudaStreamSynchronize(stream));
@@ -854,7 +873,7 @@ namespace msm {
}
template <typename S, typename A, typename P>
cudaError_t MSM(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results)
cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig& config, P* results)
{
const int bitsize = (config.bitsize == 0) ? S::NBITS : config.bitsize;
cudaStream_t& stream = config.ctx.stream;
@@ -871,59 +890,7 @@ namespace msm {
bitsize, c, scalars, points, config.batch_size, msm_size,
(config.points_size == 0) ? msm_size : config.points_size, results, config.are_scalars_on_device,
config.are_scalars_montgomery_form, config.are_points_on_device, config.are_points_montgomery_form,
config.are_results_on_device, config.is_big_triangle, config.large_bucket_factor, config.precompute_factor,
config.is_async, stream));
}
// Fills `output_bases` with `precompute_factor` consecutive slices of `bases_size` points each:
// slice 0 is a copy of `bases`, and slice i (i >= 1) is produced from slice i-1 by `left_shift_kernel`.
// All work is enqueued on `ctx.stream`; this function does not synchronize the stream itself.
// `_c` is accepted for interface stability but is not read here (the window size is fixed below).
// NOTE(review): `left_shift_kernel` is defined elsewhere; presumably it multiplies every point by
// 2^shift - confirm against its definition.
template <typename A, typename P>
cudaError_t PrecomputeMSMBases(
A* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
A* output_bases)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = ctx.stream;
// Slice 0: the original bases, copied from host or device depending on `are_bases_on_device`.
CHK_IF_RETURN(cudaMemcpyAsync(
output_bases, bases, sizeof(A) * bases_size,
are_bases_on_device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, stream));
// Window size is hard-coded to 16 bits, independent of the `_c` argument.
unsigned c = 16;
// ceil(SCALAR_FF_NBITS / c): number of c-bit windows needed to cover a scalar.
unsigned total_nof_bms = (P::SCALAR_FF_NBITS - 1) / c + 1;
// c * ceil(total_nof_bms / precompute_factor): shift (in bits) applied between consecutive slices.
unsigned shift = c * ((total_nof_bms - 1) / precompute_factor + 1);
// One thread per base point, 256 threads per block.
unsigned NUM_THREADS = 1 << 8;
unsigned NUM_BLOCKS = (bases_size + NUM_THREADS - 1) / NUM_THREADS;
// Each launch reads slice i-1 and writes slice i; launches on the same stream run in issue order,
// so slice i-1 is complete before it is consumed.
for (int i = 1; i < precompute_factor; i++) {
left_shift_kernel<A, P><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
&output_bases[(i - 1) * bases_size], shift, bases_size, &output_bases[i * bases_size]);
}
// NOTE(review): CHK_LAST() is a project macro; presumably it reports the last CUDA error
// (e.g. a failed kernel launch) - confirm in utils/error_handler.cuh.
return CHK_LAST();
}
/**
* Extern "C" version of [PrecomputeMSMBases](@ref PrecomputeMSMBases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref affine_t) of curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, PrecomputeMSMBases)(
curve_config::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
curve_config::affine_t* output_bases)
{
// Pure forwarder: instantiates the template with the curve's affine/projective point types and
// passes every argument through unchanged.
return PrecomputeMSMBases<curve_config::affine_t, curve_config::projective_t>(
bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
}
config.are_results_on_device, config.is_big_triangle, config.large_bucket_factor, config.is_async, stream));
}
/**
@@ -935,8 +902,8 @@ namespace msm {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, MSMCuda)(
const curve_config::scalar_t* scalars,
const curve_config::affine_t* points,
curve_config::scalar_t* scalars,
curve_config::affine_t* points,
int msm_size,
MSMConfig& config,
curve_config::projective_t* out)
@@ -945,26 +912,12 @@ namespace msm {
scalars, points, msm_size, config, out);
}
#if defined(G2_DEFINED)
/**
* Extern "C" version of [PrecomputeMSMBases](@ref PrecomputeMSMBases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
* Extern "C" version of [DefaultMSMConfig](@ref DefaultMSMConfig) function.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2PrecomputeMSMBases)(
curve_config::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
curve_config::g2_affine_t* output_bases)
{
// Pure forwarder: instantiates the template with the G2 affine/projective point types and
// passes every argument through unchanged.
return PrecomputeMSMBases<curve_config::g2_affine_t, curve_config::g2_projective_t>(
bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
}
extern "C" MSMConfig CONCAT_EXPAND(CURVE, DefaultMSMConfig)() { return DefaultMSMConfig<curve_config::affine_t>(); }
#if defined(G2_DEFINED)
/**
* Extern "C" version of [MSM](@ref MSM) function with the following values of template parameters
@@ -975,8 +928,8 @@ namespace msm {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2MSMCuda)(
const curve_config::scalar_t* scalars,
const curve_config::g2_affine_t* points,
curve_config::scalar_t* scalars,
curve_config::g2_affine_t* points,
int msm_size,
MSMConfig& config,
curve_config::g2_projective_t* out)
@@ -985,6 +938,15 @@ namespace msm {
scalars, points, msm_size, config, out);
}
/**
* Extern "C" version of [DefaultMSMConfig](@ref DefaultMSMConfig) function for the G2 curve
* (functionally no different than the default MSM config function for G1).
*/
extern "C" MSMConfig CONCAT_EXPAND(CURVE, G2DefaultMSMConfig)()
{
return DefaultMSMConfig<curve_config::g2_affine_t>();
}
#endif
} // namespace msm
} // namespace msm

View File

@@ -4,12 +4,12 @@
#include <cuda_runtime.h>
#include "curves/curve_config.cuh"
#include "primitives/affine.cuh"
#include "primitives/field.cuh"
#include "primitives/projective.cuh"
#include "utils/device_context.cuh"
#include "utils/error_handler.cuh"
#include "../../curves/curve_config.cuh"
#include "../../primitives/affine.cuh"
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/device_context.cuh"
#include "../../utils/error_handler.cuh"
/**
* @namespace msm
@@ -43,18 +43,14 @@ namespace msm {
* variable is set equal to the MSM size. And if every MSM uses a distinct set of
* points, it should be set to the product of MSM size and [batch_size](@ref
* batch_size). Default value: 0 (meaning it's equal to the MSM size). */
int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
* [PrecomputeMSMBases](@ref PrecomputeMSMBases) function, `precompute_factor` passed
* there needs to be equal to the one used here. Larger values decrease the
int precompute_factor; /**< The number of extra points to pre-compute for each point. Larger values decrease the
* number of computations to make, on-line memory footprint, but increase the static
* memory footprint. Default value: 1 (i.e. don't pre-compute). */
int c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket
* method" that we use to solve the MSM problem. As a rule of thumb, larger value
* means more on-line memory footprint but also more parallelism and less computational
* complexity (up to a certain point). Currently pre-computation is independent of
* \f$ c \f$, however in the future value of \f$ c \f$ here and the one passed into the
* [PrecomputeMSMBases](@ref PrecomputeMSMBases) function will need to be identical.
* Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
* complexity (up to a certain point). Default value: 0 (the optimal value of \f$ c \f$
* is chosen automatically). */
int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
* but if a different (better) upper bound is known, it should be reflected in this
* variable. Default value: 0 (set to the bitsize of scalar field). */
@@ -105,39 +101,12 @@ namespace msm {
* Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) point in our codebase.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
* **Note:** this function is still WIP and the following [MSMConfig](@ref MSMConfig) members do not yet have any
* effect: `precompute_factor` (always equals 1) and `ctx.device_id` (0 device is always used).
* Also, it's currently better to use `batch_size=1` in most cases (except when dealing with very many MSMs).
*/
template <typename S, typename A, typename P>
cudaError_t MSM(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);
/**
* A function that precomputes MSM bases by extending them with their shifted copies.
* e.g.:
* Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
* Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
* 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2cl}P_{size}, ... \f$
* @param bases Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
* @param bases_size Number of bases.
* @param precompute_factor The number of total precomputed points for each base (including the base itself).
* @param _c This is currently unused, but in the future precomputation will need to be aware of
* the `c` value used in MSM (see [MSMConfig](@ref MSMConfig)). So to avoid breaking your code with this
* upcoming change, make sure to use the same value of `c` in this function and in respective MSMConfig.
* @param are_bases_on_device Whether the bases are on device.
* @param ctx Device context specifying device id and stream to use.
* @param output_bases Device-allocated buffer of size bases_size * precompute_factor for the extended bases.
* @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
* Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
*/
template <typename A, typename P>
cudaError_t PrecomputeMSMBases(
A* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
A* output_bases);
cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig& config, P* results);
} // namespace msm

View File

@@ -6,10 +6,11 @@
#include <iostream>
#include <vector>
#include "curves/curve_config.cuh"
#include "primitives/field.cuh"
#include "primitives/projective.cuh"
#include "utils/device_context.cuh"
#include "../../curves/curve_config.cuh"
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/cuda_utils.cuh"
#include "../../utils/device_context.cuh"
class Dummy_Scalar
{
@@ -30,7 +31,7 @@ public:
return os;
}
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width)
{
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
}
@@ -86,7 +87,7 @@ public:
{
Dummy_Projective res = zero();
#ifdef CUDA_ARCH
UNROLL
#pragma unroll
#endif
for (int i = 0; i < Dummy_Scalar::NBITS; i++) {
if (i > 0) { res = res + res; }

View File

@@ -4,10 +4,3 @@ build_verification:
test_verification: build_verification
work/test_verification
build_verification_ecntt:
mkdir -p work
nvcc -o work/test_verification_ecntt -I. -I.. -I../.. -I../ntt tests/verification.cu -std=c++17 -DECNTT_DEFINED
test_verification_ecntt: build_verification_ecntt
work/test_verification_ecntt

View File

@@ -2,9 +2,9 @@
#include "appUtils/ntt/thread_ntt.cu"
#include "curves/curve_config.cuh"
#include "utils/sharedmem.cuh"
#include "appUtils/ntt/ntt.cuh" // for ntt::Ordering
#include "appUtils/ntt/ntt.cuh" // for Ordering
namespace mxntt {
namespace ntt {
static inline __device__ uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
{
@@ -56,15 +56,7 @@ namespace mxntt {
// Note: the following reorder kernels are fused with normalization for INTT
template <typename E, typename S, uint32_t MAX_GROUP_SIZE = 80>
static __global__ void reorder_digits_inplace_and_normalize_kernel(
E* arr,
uint32_t log_size,
bool columns_batch,
uint32_t batch_size,
bool dit,
bool fast_tw,
eRevType rev_type,
bool is_normalize,
S inverse_N)
E* arr, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type, bool is_normalize, S inverse_N)
{
// launch N threads (per batch element)
// each thread starts from one index and calculates the corresponding group
@@ -73,20 +65,19 @@ namespace mxntt {
const uint32_t size = 1 << log_size;
const uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
const uint32_t idx = columns_batch ? tid / batch_size : tid % size;
const uint32_t batch_idx = columns_batch ? tid % batch_size : tid / size;
if (tid >= size * batch_size) return;
const uint32_t idx = tid % size;
const uint32_t batch_idx = tid / size;
uint32_t next_element = idx;
uint32_t group[MAX_GROUP_SIZE];
group[0] = columns_batch ? next_element * batch_size + batch_idx : next_element + size * batch_idx;
group[0] = next_element + size * batch_idx;
uint32_t i = 1;
for (; i < MAX_GROUP_SIZE;) {
next_element = generalized_rev(next_element, log_size, dit, fast_tw, rev_type);
if (next_element < idx) return; // not handling this group
if (next_element == idx) break; // calculated whole group
group[i++] = columns_batch ? next_element * batch_size + batch_idx : next_element + size * batch_idx;
group[i++] = next_element + size * batch_idx;
}
--i;
@@ -100,12 +91,9 @@ namespace mxntt {
template <typename E, typename S>
__launch_bounds__(64) __global__ void reorder_digits_and_normalize_kernel(
const E* arr,
E* arr,
E* arr_reordered,
uint32_t log_size,
bool columns_batch,
uint32_t batch_size,
uint32_t columns_batch_size,
bool dit,
bool fast_tw,
eRevType rev_type,
@@ -113,46 +101,41 @@ namespace mxntt {
S inverse_N)
{
uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= (1 << log_size) * batch_size) return;
uint32_t rd = tid;
uint32_t wr = (columns_batch ? 0 : ((tid >> log_size) << log_size)) +
generalized_rev((tid / columns_batch_size) & ((1 << log_size) - 1), log_size, dit, fast_tw, rev_type);
arr_reordered[wr * columns_batch_size + (tid % columns_batch_size)] = is_normalize ? arr[rd] * inverse_N : arr[rd];
uint32_t wr =
((tid >> log_size) << log_size) + generalized_rev(tid & ((1 << log_size) - 1), log_size, dit, fast_tw, rev_type);
arr_reordered[wr] = is_normalize ? arr[rd] * inverse_N : arr[rd];
}
template <typename E, typename S>
static __global__ void batch_elementwise_mul_with_reorder_kernel(
const E* in_vec,
uint32_t size,
bool columns_batch,
uint32_t batch_size,
uint32_t columns_batch_size,
static __global__ void batch_elementwise_mul_with_reorder(
E* in_vec,
int n_elements,
int batch_size,
S* scalar_vec,
int step,
int n_scalars,
uint32_t log_size,
int logn,
eRevType rev_type,
bool dit,
E* out_vec)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= size * batch_size) return;
int64_t scalar_id = (tid / columns_batch_size) % size;
if (rev_type != eRevType::None)
scalar_id = generalized_rev((tid / columns_batch_size) & ((1 << log_size) - 1), log_size, dit, false, rev_type);
if (tid >= n_elements * batch_size) return;
int64_t scalar_id = tid % n_elements;
if (rev_type != eRevType::None) scalar_id = generalized_rev(tid, logn, dit, false, rev_type);
out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid];
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt64(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -170,27 +153,19 @@ namespace mxntt {
s_meta.th_stride = 8;
s_meta.ntt_block_size = 64;
s_meta.ntt_block_id = columns_batch_size ? blockIdx.x / ((columns_batch_size + 7) / 8)
: (blockIdx.x << 3) + (strided ? (threadIdx.x & 0x7) : (threadIdx.x >> 3));
s_meta.ntt_block_id = (blockIdx.x << 3) + (strided ? (threadIdx.x & 0x7) : (threadIdx.x >> 3));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 3) : (threadIdx.x & 0x7);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0x7) + ((blockIdx.x % ((columns_batch_size + 7) / 8)) << 3) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalDataColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (twiddle_stride && dit) {
if (fast_tw)
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric64(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
@@ -214,28 +189,24 @@ namespace mxntt {
if (twiddle_stride && !dit) {
if (fast_tw)
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric64(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
if (columns_batch_size)
engine.storeGlobalDataColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt32(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -254,25 +225,16 @@ namespace mxntt {
s_meta.th_stride = 4;
s_meta.ntt_block_size = 32;
s_meta.ntt_block_id = columns_batch_size ? blockIdx.x / ((columns_batch_size + 15) / 16)
: (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_block_id = (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 4) : (threadIdx.x & 0x3);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0xf) + ((blockIdx.x % ((columns_batch_size + 15) / 16)) << 4) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalDataColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (fast_tw)
engine.loadInternalTwiddles32(internal_twiddles, strided);
else
@@ -285,28 +247,24 @@ namespace mxntt {
engine.ntt4_2();
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric32(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
if (columns_batch_size)
engine.storeGlobalData32ColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData32(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData32(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt32dit(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -325,27 +283,19 @@ namespace mxntt {
s_meta.th_stride = 4;
s_meta.ntt_block_size = 32;
s_meta.ntt_block_id = columns_batch_size ? blockIdx.x / ((columns_batch_size + 15) / 16)
: (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_block_id = (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 4) : (threadIdx.x & 0x3);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0xf) + ((blockIdx.x % ((columns_batch_size + 15) / 16)) << 4) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalData32ColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData32(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData32(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric32(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
@@ -361,22 +311,18 @@ namespace mxntt {
engine.SharedData32Rows8(shmem, false, false, strided); // load
engine.twiddlesInternal();
engine.ntt8win();
if (columns_batch_size)
engine.storeGlobalDataColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt16(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -395,26 +341,16 @@ namespace mxntt {
s_meta.th_stride = 2;
s_meta.ntt_block_size = 16;
s_meta.ntt_block_id = columns_batch_size
? blockIdx.x / ((columns_batch_size + 31) / 32)
: (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_block_id = (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 5) : (threadIdx.x & 0x1);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0x1f) + ((blockIdx.x % ((columns_batch_size + 31) / 32)) << 5) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalDataColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (fast_tw)
engine.loadInternalTwiddles16(internal_twiddles, strided);
else
@@ -427,28 +363,24 @@ namespace mxntt {
engine.ntt2_4();
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric16(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
if (columns_batch_size)
engine.storeGlobalData16ColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData16(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData16(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt16dit(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -467,29 +399,19 @@ namespace mxntt {
s_meta.th_stride = 2;
s_meta.ntt_block_size = 16;
s_meta.ntt_block_id = columns_batch_size
? blockIdx.x / ((columns_batch_size + 31) / 32)
: (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_block_id = (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 5) : (threadIdx.x & 0x1);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0x1f) + ((blockIdx.x % ((columns_batch_size + 31) / 32)) << 5) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalData16ColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData16(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData16(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric16(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
@@ -505,17 +427,13 @@ namespace mxntt {
engine.SharedData16Rows8(shmem, false, false, strided); // load
engine.twiddlesInternal();
engine.ntt8win();
if (columns_batch_size)
engine.storeGlobalDataColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
// Scales `data` in place: every thread derives a flat index `tid` from its block and
// thread coordinates and multiplies data[tid] by `norm_factor`.
// NOTE(review): this span comes from a rendered diff, so two alternative signatures appear
// back to back (one with a `size` parameter, one without). The `tid >= size` guard below can
// only compile with the signature that declares `size` -- confirm against the real file which
// variant is live before relying on the bounds check.
template <typename E, typename S>
__global__ void normalize_kernel(E* data, S norm_factor, uint32_t size)
__global__ void normalize_kernel(E* data, S norm_factor)
{
// One element per thread, indexed across the whole 1-D grid.
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
// Threads whose index is at or past `size` do nothing.
if (tid >= size) return;
data[tid] = data[tid] * norm_factor;
}
@@ -740,7 +658,7 @@ namespace mxntt {
template <typename E, typename S>
cudaError_t large_ntt(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
@@ -748,7 +666,6 @@ namespace mxntt {
uint32_t log_size,
uint32_t tw_log_size,
uint32_t batch_size,
bool columns_batch,
bool inv,
bool normalize,
bool dit,
@@ -762,83 +679,72 @@ namespace mxntt {
}
if (log_size == 4) {
const int NOF_THREADS = columns_batch ? 64 : min(64, 2 * batch_size);
const int NOF_BLOCKS =
columns_batch ? ((batch_size + 31) / 32) : (2 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
const int NOF_THREADS = min(64, 2 * batch_size);
const int NOF_BLOCKS = (2 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
if (dit) {
ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
} else { // dif
ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
}
if (normalize)
normalize_kernel<<<batch_size, 16, 0, cuda_stream>>>(out, S::inv_log_size(4), (1 << log_size) * batch_size);
if (normalize) normalize_kernel<<<batch_size, 16, 0, cuda_stream>>>(out, S::inv_log_size(4));
return CHK_LAST();
}
if (log_size == 5) {
const int NOF_THREADS = columns_batch ? 64 : min(64, 4 * batch_size);
const int NOF_BLOCKS =
columns_batch ? ((batch_size + 15) / 16) : (4 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
const int NOF_THREADS = min(64, 4 * batch_size);
const int NOF_BLOCKS = (4 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
if (dit) {
ntt32dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
} else { // dif
ntt32<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
}
if (normalize)
normalize_kernel<<<batch_size, 32, 0, cuda_stream>>>(out, S::inv_log_size(5), (1 << log_size) * batch_size);
if (normalize) normalize_kernel<<<batch_size, 32, 0, cuda_stream>>>(out, S::inv_log_size(5));
return CHK_LAST();
}
if (log_size == 6) {
const int NOF_THREADS = columns_batch ? 64 : min(64, 8 * batch_size);
const int NOF_BLOCKS =
columns_batch ? ((batch_size + 7) / 8) : ((8 * batch_size + NOF_THREADS - 1) / NOF_THREADS);
const int NOF_THREADS = min(64, 8 * batch_size);
const int NOF_BLOCKS = (8 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
ntt64<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
if (normalize)
normalize_kernel<<<batch_size, 64, 0, cuda_stream>>>(out, S::inv_log_size(6), (1 << log_size) * batch_size);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
if (normalize) normalize_kernel<<<batch_size, 64, 0, cuda_stream>>>(out, S::inv_log_size(6));
return CHK_LAST();
}
if (log_size == 8) {
const int NOF_THREADS = 64;
const int NOF_BLOCKS =
columns_batch ? ((batch_size + 31) / 32 * 16) : ((32 * batch_size + NOF_THREADS - 1) / NOF_THREADS);
const int NOF_BLOCKS = (32 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
if (dit) {
ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 1, 0, 0,
columns_batch, 0, inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit, fast_tw);
ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 16, 4, 16, true, 1,
inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit, fast_tw);
} else { // dif
ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 16, 4, 16, true, 1,
inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit, fast_tw);
ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 1, 0, 0,
columns_batch, 0, inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit, fast_tw);
}
if (normalize)
normalize_kernel<<<batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(8), (1 << log_size) * batch_size);
if (normalize) normalize_kernel<<<batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(8));
return CHK_LAST();
}
// general case:
uint32_t nof_blocks = (1 << (log_size - 9)) * (columns_batch ? ((batch_size + 31) / 32) * 32 : batch_size);
uint32_t nof_blocks = (1 << (log_size - 9)) * batch_size;
if (dit) {
for (int i = 0; i < 5; i++) {
uint32_t stage_size = fast_tw ? STAGE_SIZES_HOST_FT[log_size][i] : STAGE_SIZES_HOST[log_size][i];
@@ -848,18 +754,18 @@ namespace mxntt {
if (stage_size == 6)
ntt64<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 6) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 5)
ntt32dit<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 5) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 4)
ntt16dit<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
}
} else { // dif
bool first_run = false, prev_stage = false;
@@ -872,31 +778,30 @@ namespace mxntt {
if (stage_size == 6)
ntt64<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 6) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 5)
ntt32<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 5) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 4)
ntt16<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
prev_stage = stage_size;
}
}
if (normalize)
normalize_kernel<<<(1 << (log_size - 8)) * batch_size, 256, 0, cuda_stream>>>(
out, S::inv_log_size(log_size), (1 << log_size) * batch_size);
normalize_kernel<<<(1 << (log_size - 8)) * batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(log_size));
return CHK_LAST();
}
template <typename E, typename S>
cudaError_t mixed_radix_ntt(
const E* d_input,
E* d_input,
E* d_output,
S* external_twiddles,
S* internal_twiddles,
@@ -904,10 +809,9 @@ namespace mxntt {
int ntt_size,
int max_logn,
int batch_size,
bool columns_batch,
bool is_inverse,
bool fast_tw,
ntt::Ordering ordering,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream)
@@ -925,39 +829,38 @@ namespace mxntt {
eRevType reverse_input = None, reverse_output = None, reverse_coset = None;
bool dit = false;
switch (ordering) {
case ntt::Ordering::kNN:
case Ordering::kNN:
reverse_input = eRevType::NaturalToMixedRev;
dit = true;
break;
case ntt::Ordering::kRN:
case Ordering::kRN:
reverse_input = eRevType::RevToMixedRev;
dit = true;
reverse_coset = is_inverse ? eRevType::None : eRevType::NaturalToRev;
break;
case ntt::Ordering::kNR:
case Ordering::kNR:
reverse_output = eRevType::MixedRevToRev;
reverse_coset = is_inverse ? eRevType::NaturalToRev : eRevType::None;
break;
case ntt::Ordering::kRR:
case Ordering::kRR:
reverse_input = eRevType::RevToMixedRev;
dit = true;
reverse_output = eRevType::NaturalToRev;
reverse_coset = eRevType::NaturalToRev;
break;
case ntt::Ordering::kMN:
case Ordering::kMN:
dit = true;
reverse_coset = is_inverse ? None : eRevType::NaturalToMixedRev;
break;
case ntt::Ordering::kNM:
case Ordering::kNM:
reverse_coset = is_inverse ? eRevType::NaturalToMixedRev : eRevType::None;
break;
}
if (is_on_coset && !is_inverse) {
batch_elementwise_mul_with_reorder_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_input, ntt_size, columns_batch, batch_size, columns_batch ? batch_size : 1,
arbitrary_coset ? arbitrary_coset : external_twiddles, arbitrary_coset ? 1 : coset_gen_index, n_twiddles, logn,
reverse_coset, dit, d_output);
batch_elementwise_mul_with_reorder<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_input, ntt_size, batch_size, arbitrary_coset ? arbitrary_coset : external_twiddles,
arbitrary_coset ? 1 : coset_gen_index, n_twiddles, logn, reverse_coset, dit, d_output);
d_input = d_output;
}
@@ -966,11 +869,10 @@ namespace mxntt {
const bool is_reverse_in_place = (d_input == d_output);
if (is_reverse_in_place) {
reorder_digits_inplace_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, logn, columns_batch, batch_size, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
d_output, logn, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
} else {
reorder_digits_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_input, d_output, logn, columns_batch, batch_size, columns_batch ? batch_size : 1, dit, fast_tw,
reverse_input, is_normalize, S::inv_log_size(logn));
d_input, d_output, logn, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
}
is_normalize = false;
d_input = d_output;
@@ -978,19 +880,18 @@ namespace mxntt {
// inplace ntt
CHK_IF_RETURN(large_ntt(
d_input, d_output, external_twiddles, internal_twiddles, basic_twiddles, logn, max_logn, batch_size,
columns_batch, is_inverse, (is_normalize && reverse_output == eRevType::None), dit, fast_tw, cuda_stream));
d_input, d_output, external_twiddles, internal_twiddles, basic_twiddles, logn, max_logn, batch_size, is_inverse,
(is_normalize && reverse_output == eRevType::None), dit, fast_tw, cuda_stream));
if (reverse_output != eRevType::None) {
reorder_digits_inplace_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, logn, columns_batch, batch_size, dit, fast_tw, reverse_output, is_normalize, S::inv_log_size(logn));
d_output, logn, dit, fast_tw, reverse_output, is_normalize, S::inv_log_size(logn));
}
if (is_on_coset && is_inverse) {
batch_elementwise_mul_with_reorder_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, ntt_size, columns_batch, batch_size, columns_batch ? batch_size : 1,
arbitrary_coset ? arbitrary_coset : external_twiddles + n_twiddles, arbitrary_coset ? 1 : -coset_gen_index,
n_twiddles, logn, reverse_coset, dit, d_output);
batch_elementwise_mul_with_reorder<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, ntt_size, batch_size, arbitrary_coset ? arbitrary_coset : external_twiddles + n_twiddles,
arbitrary_coset ? 1 : -coset_gen_index, n_twiddles, logn, reverse_coset, dit, d_output);
}
return CHK_LAST();
@@ -1014,7 +915,7 @@ namespace mxntt {
cudaStream_t& stream);
template cudaError_t mixed_radix_ntt<curve_config::scalar_t, curve_config::scalar_t>(
const curve_config::scalar_t* d_input,
curve_config::scalar_t* d_input,
curve_config::scalar_t* d_output,
curve_config::scalar_t* external_twiddles,
curve_config::scalar_t* internal_twiddles,
@@ -1022,30 +923,11 @@ namespace mxntt {
int ntt_size,
int max_logn,
int batch_size,
bool columns_batch,
bool is_inverse,
bool fast_tw,
ntt::Ordering ordering,
Ordering ordering,
curve_config::scalar_t* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream);
// TODO: we may reintroduce mixed-radix ECNTT based on upcoming benching PR
// #if defined(ECNTT_DEFINED)
// template cudaError_t mixed_radix_ntt<curve_config::projective_t, curve_config::scalar_t>(
// curve_config::projective_t* d_input,
// curve_config::projective_t* d_output,
// curve_config::scalar_t* external_twiddles,
// curve_config::scalar_t* internal_twiddles,
// curve_config::scalar_t* basic_twiddles,
// int ntt_size,
// int max_logn,
// int batch_size,
// bool columns_batch,
// bool is_inverse,
// bool fast_tw,
// ntt::Ordering ordering,
// curve_config::scalar_t* arbitrary_coset,
// int coset_gen_index,
// cudaStream_t cuda_stream);
// #endif // ECNTT_DEFINED
} // namespace mxntt
} // namespace ntt

View File

@@ -2,32 +2,26 @@
#include <unordered_map>
#include <vector>
#include <type_traits>
#include "curves/curve_config.cuh"
#include "utils/sharedmem.cuh"
#include "utils/utils_kernels.cuh"
#include "utils/utils.h"
#include "appUtils/ntt/ntt_impl.cuh"
#include "appUtils/ntt/ntt.cuh" // for ntt::Ordering
#include <mutex>
#define IS_ECNTT std::is_same_v<E, curve_config::projective_t>
namespace ntt {
namespace {
// TODO: Set MAX THREADS based on GPU arch
const uint32_t MAX_NUM_THREADS = 512; // TODO: hotfix - should be 1024, currently limits shared memory size
const uint32_t MAX_THREADS_BATCH = 512;
const uint32_t MAX_THREADS_BATCH_ECNTT =
256; // TODO: hardcodded - allows (2^18 x 64) ECNTT for sm86, decrease this to allow larger batch or ecntt length
const uint32_t MAX_NUM_THREADS = 512; // TODO: hotfix - should be 1024, currently limits shared memory size
const uint32_t MAX_THREADS_BATCH = 512; // TODO: allows 100% occupancy for scalar NTT for sm_86..sm_89
const uint32_t MAX_SHARED_MEM_ELEMENT_SIZE = 32; // TODO: occupancy calculator, hardcoded for sm_86..sm_89
const uint32_t MAX_SHARED_MEM = MAX_SHARED_MEM_ELEMENT_SIZE * MAX_NUM_THREADS;
template <typename E>
__global__ void reverse_order_kernel(const E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size)
__global__ void reverse_order_kernel(E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size)
{
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
if (threadId < n * batch_size) {
@@ -35,14 +29,9 @@ namespace ntt {
int batch_idx = threadId / n;
int idx_reversed = __brev(idx) >> (32 - logn);
if (arr == arr_reversed) { // for in-place (when pointers arr==arr_reversed)
if (idx < idx_reversed) {
E val = arr[batch_idx * n + idx];
arr_reversed[batch_idx * n + idx] = arr[batch_idx * n + idx_reversed];
arr_reversed[batch_idx * n + idx_reversed] = val;
}
} else
arr_reversed[batch_idx * n + idx_reversed] = arr[batch_idx * n + idx];
E val = arr[batch_idx * n + idx];
if (arr == arr_reversed) { __syncthreads(); } // for in-place (when pointers arr==arr_reversed)
arr_reversed[batch_idx * n + idx_reversed] = val;
}
}
@@ -57,8 +46,7 @@ namespace ntt {
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order_batch(
const E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out)
void reverse_order_batch(E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out)
{
int number_of_threads = MAX_THREADS_BATCH;
int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads;
@@ -75,7 +63,7 @@ namespace ntt {
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
// NOTE(review): two signatures are shown because this text is a rendered diff; they differ
// only in whether `arr_in` is `const E*` or `E*`. Confirm which one the real file uses.
void reverse_order(const E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out)
void reverse_order(E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out)
{
// Single-array convenience wrapper: forwards to reverse_order_batch with a batch size of 1.
reverse_order_batch(arr_in, n, logn, 1, stream, arr_out);
}
@@ -93,7 +81,7 @@ namespace ntt {
*/
template <typename E, typename S>
__global__ void ntt_template_kernel_shared_rev(
const E* __restrict__ arr_in,
E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
@@ -114,7 +102,7 @@ namespace ntt {
uint32_t l = threadIdx.x;
if (l < loop_limit) {
UNROLL
#pragma unroll
for (; ss < logn; ss++) {
int s = logn - ss - 1;
bool is_beginning = ss == 0;
@@ -165,7 +153,7 @@ namespace ntt {
*/
template <typename E, typename S>
__global__ void ntt_template_kernel_shared(
const E* __restrict__ arr_in,
E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
@@ -186,7 +174,7 @@ namespace ntt {
uint32_t l = threadIdx.x;
if (l < loop_limit) {
UNROLL
#pragma unroll
for (; s < logn; s++) // TODO: this loop also can be unrolled
{
uint32_t ntw_i = task % chunks;
@@ -233,7 +221,7 @@ namespace ntt {
*/
template <typename E, typename S>
__global__ void
ntt_template_kernel(const E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out)
ntt_template_kernel(E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out)
{
int task = blockIdx.x;
int chunks = n / (blockDim.x * 2);
@@ -285,7 +273,7 @@ namespace ntt {
*/
template <typename E, typename S>
cudaError_t ntt_inplace_batch_template(
const E* d_input,
E* d_input,
int n,
S* d_twiddles,
int n_twiddles,
@@ -302,8 +290,7 @@ namespace ntt {
bool is_shared_mem_enabled = sizeof(E) <= MAX_SHARED_MEM_ELEMENT_SIZE;
const int log2_shmem_elems = is_shared_mem_enabled ? int(log(int(MAX_SHARED_MEM / sizeof(E))) / log(2)) : logn;
int max_threads_batch = IS_ECNTT ? MAX_THREADS_BATCH_ECNTT : MAX_THREADS_BATCH;
int num_threads = max(min(min(n / 2, max_threads_batch), 1 << (log2_shmem_elems - 1)), 1);
int num_threads = max(min(min(n / 2, MAX_THREADS_BATCH), 1 << (log2_shmem_elems - 1)), 1);
const int chunks = max(int((n / 2) / num_threads), 1);
const int total_tasks = batch_size * chunks;
int num_blocks = total_tasks;
@@ -401,11 +388,10 @@ namespace ntt {
template <typename U>
friend cudaError_t InitDomain<U>(U primitive_root, device_context::DeviceContext& ctx, bool fast_tw);
template <typename U>
friend cudaError_t ReleaseDomain(device_context::DeviceContext& ctx);
cudaError_t ReleaseDomain(device_context::DeviceContext& ctx);
template <typename U, typename E>
friend cudaError_t NTT<U, E>(const E* input, int size, NTTDir dir, NTTConfig<U>& config, E* output);
friend cudaError_t NTT<U, E>(E* input, int size, NTTDir dir, NTTConfig<U>& config, E* output);
};
template <typename S>
@@ -450,7 +436,7 @@ namespace ntt {
// Note: radix-2 INTT needs ONE in last element (in addition to first element), therefore have n+1 elements
// Managed allocation allows host to read the elements (logn) without copying all (n) TFs back to host
CHK_IF_RETURN(cudaMallocManaged(&domain.twiddles, (domain.max_size + 1) * sizeof(S)));
CHK_IF_RETURN(mxntt::generate_external_twiddles_generic(
CHK_IF_RETURN(generate_external_twiddles_generic(
primitive_root, domain.twiddles, domain.internal_twiddles, domain.basic_twiddles, domain.max_log_size,
ctx.stream));
@@ -460,7 +446,7 @@ namespace ntt {
CHK_IF_RETURN(cudaMallocAsync(&domain.fast_external_twiddles_inv, domain.max_size * sizeof(S) * 2, ctx.stream));
// fast-twiddles forward NTT
CHK_IF_RETURN(mxntt::generate_external_twiddles_fast_twiddles_mode(
CHK_IF_RETURN(generate_external_twiddles_fast_twiddles_mode(
primitive_root, domain.fast_external_twiddles, domain.fast_internal_twiddles, domain.fast_basic_twiddles,
domain.max_log_size, ctx.stream));
@@ -468,7 +454,7 @@ namespace ntt {
S primitive_root_inv;
CHK_IF_RETURN(cudaMemcpyAsync(
&primitive_root_inv, &domain.twiddles[domain.max_size - 1], sizeof(S), cudaMemcpyDeviceToHost, ctx.stream));
CHK_IF_RETURN(mxntt::generate_external_twiddles_fast_twiddles_mode(
CHK_IF_RETURN(generate_external_twiddles_fast_twiddles_mode(
primitive_root_inv, domain.fast_external_twiddles_inv, domain.fast_internal_twiddles_inv,
domain.fast_basic_twiddles_inv, domain.max_log_size, ctx.stream));
}
@@ -496,51 +482,46 @@ namespace ntt {
}
// Tears down an NTT domain: zeroes the size fields, releases the twiddle buffers with
// cudaFreeAsync on `ctx.stream`, nulls the pointers, clears `coset_index`, and returns
// CHK_LAST().
// NOTE(review): this span comes from a rendered diff and interleaves two implementations:
//   * a free function that looks the domain up in domains_for_devices<S>[ctx.device_id],
//     touches it through `domain.`, and wraps every free in CHK_IF_RETURN;
//   * a Domain<S> member function that touches the fields directly and ignores the
//     return value of each cudaFreeAsync.
// Confirm against the real file which variant is live; the two must not be merged as-is.
template <typename S>
cudaError_t ReleaseDomain(device_context::DeviceContext& ctx)
cudaError_t Domain<S>::ReleaseDomain(device_context::DeviceContext& ctx)
{
CHK_INIT_IF_RETURN();
// Free-function variant: pick the per-device domain by the context's device id.
Domain<S>& domain = domains_for_devices<S>[ctx.device_id];
// --- member-function variant: base twiddle buffers (free results are not checked) ---
max_size = 0;
max_log_size = 0;
cudaFreeAsync(twiddles, ctx.stream);
twiddles = nullptr;
cudaFreeAsync(internal_twiddles, ctx.stream);
internal_twiddles = nullptr;
cudaFreeAsync(basic_twiddles, ctx.stream);
basic_twiddles = nullptr;
coset_index.clear();
// --- free-function variant: every free is checked and aborts the release on error ---
domain.max_size = 0;
domain.max_log_size = 0;
// NOTE(review): this variant never frees `twiddles`, while the other variant passes it to
// cudaFreeAsync. Verify which release path is valid for a cudaMallocManaged allocation.
domain.twiddles = nullptr; // allocated via cudaMallocManaged(...) so released without calling cudaFree(...)
CHK_IF_RETURN(cudaFreeAsync(domain.internal_twiddles, ctx.stream));
domain.internal_twiddles = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.basic_twiddles, ctx.stream));
domain.basic_twiddles = nullptr;
domain.coset_index.clear();
// Fast-twiddle buffers, forward then inverse.
CHK_IF_RETURN(cudaFreeAsync(domain.fast_external_twiddles, ctx.stream));
domain.fast_external_twiddles = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_internal_twiddles, ctx.stream));
domain.fast_internal_twiddles = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_basic_twiddles, ctx.stream));
domain.fast_basic_twiddles = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_external_twiddles_inv, ctx.stream));
domain.fast_external_twiddles_inv = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_internal_twiddles_inv, ctx.stream));
domain.fast_internal_twiddles_inv = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_basic_twiddles_inv, ctx.stream));
domain.fast_basic_twiddles_inv = nullptr;
// Only this variant resets the `initialized` flag.
domain.initialized = false;
// --- member-function variant: fast-twiddle buffers (free results are not checked) ---
cudaFreeAsync(fast_external_twiddles, ctx.stream);
fast_external_twiddles = nullptr;
cudaFreeAsync(fast_internal_twiddles, ctx.stream);
fast_internal_twiddles = nullptr;
cudaFreeAsync(fast_basic_twiddles, ctx.stream);
fast_basic_twiddles = nullptr;
cudaFreeAsync(fast_external_twiddles_inv, ctx.stream);
fast_external_twiddles_inv = nullptr;
cudaFreeAsync(fast_internal_twiddles_inv, ctx.stream);
fast_internal_twiddles_inv = nullptr;
cudaFreeAsync(fast_basic_twiddles_inv, ctx.stream);
fast_basic_twiddles_inv = nullptr;
return CHK_LAST();
}
template <typename S>
static bool is_choosing_radix2_algorithm(int logn, int batch_size, const NTTConfig<S>& config)
static bool is_choose_radix2_algorithm(int logn, int batch_size, const NTTConfig<S>& config)
{
const bool is_mixed_radix_alg_supported = (logn > 3 && logn != 7);
if (!is_mixed_radix_alg_supported && config.columns_batch)
throw IcicleError(IcicleError_t::InvalidArgument, "columns batch is not supported for given NTT size");
const bool is_user_selected_radix2_alg = config.ntt_algorithm == NttAlgorithm::Radix2;
const bool is_force_radix2 = !is_mixed_radix_alg_supported || is_user_selected_radix2_alg;
if (is_force_radix2) return true;
const bool is_user_selected_mixed_radix_alg = config.ntt_algorithm == NttAlgorithm::MixedRadix;
if (is_user_selected_mixed_radix_alg) return false;
if (config.columns_batch) return false; // radix2 does not currently support columns batch mode.
// Heuristic to automatically select an algorithm
// Note that generally the decision depends on {logn, batch, ordering, inverse, coset, in-place, coeff-field} and
@@ -556,7 +537,7 @@ namespace ntt {
template <typename S, typename E>
cudaError_t radix2_ntt(
const E* d_input,
E* d_input,
E* d_output,
S* twiddles,
int ntt_size,
@@ -602,7 +583,7 @@ namespace ntt {
}
template <typename S, typename E>
cudaError_t NTT(const E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output)
cudaError_t NTT(E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output)
{
CHK_INIT_IF_RETURN();
@@ -629,22 +610,18 @@ namespace ntt {
bool are_inputs_on_device = config.are_inputs_on_device;
bool are_outputs_on_device = config.are_outputs_on_device;
const E* d_input;
E* d_allocated_input = nullptr;
E* d_input;
if (are_inputs_on_device) {
d_input = input;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_input, input_size_bytes, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_allocated_input, input, input_size_bytes, cudaMemcpyHostToDevice, stream));
d_input = d_allocated_input;
CHK_IF_RETURN(cudaMallocAsync(&d_input, input_size_bytes, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_input, input, input_size_bytes, cudaMemcpyHostToDevice, stream));
}
E* d_output;
E* d_allocated_output = nullptr;
if (are_outputs_on_device) {
d_output = output;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_output, input_size_bytes, stream));
d_output = d_allocated_output;
CHK_IF_RETURN(cudaMallocAsync(&d_output, input_size_bytes, stream));
}
S* coset = nullptr;
@@ -664,42 +641,37 @@ namespace ntt {
h_coset.clear();
}
const bool is_radix2_algorithm = is_choose_radix2_algorithm(logn, batch_size, config);
const bool is_inverse = dir == NTTDir::kInverse;
if constexpr (IS_ECNTT) {
if (is_radix2_algorithm) {
CHK_IF_RETURN(ntt::radix2_ntt(
d_input, d_output, domain.twiddles, size, domain.max_size, batch_size, is_inverse, config.ordering, coset,
coset_index, stream));
} else {
const bool is_radix2_algorithm = is_choosing_radix2_algorithm(logn, batch_size, config);
if (is_radix2_algorithm) {
CHK_IF_RETURN(ntt::radix2_ntt(
d_input, d_output, domain.twiddles, size, domain.max_size, batch_size, is_inverse, config.ordering, coset,
coset_index, stream));
} else {
const bool is_on_coset = (coset_index != 0) || coset;
const bool is_fast_twiddles_enabled = (domain.fast_external_twiddles != nullptr) && !is_on_coset;
S* twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_external_twiddles_inv : domain.fast_external_twiddles)
: domain.twiddles;
S* internal_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_internal_twiddles_inv : domain.fast_internal_twiddles)
: domain.internal_twiddles;
S* basic_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_basic_twiddles_inv : domain.fast_basic_twiddles)
: domain.basic_twiddles;
CHK_IF_RETURN(mxntt::mixed_radix_ntt(
d_input, d_output, twiddles, internal_twiddles, basic_twiddles, size, domain.max_log_size, batch_size,
config.columns_batch, is_inverse, is_fast_twiddles_enabled, config.ordering, coset, coset_index, stream));
}
const bool is_on_coset = (coset_index != 0) || coset;
const bool is_fast_twiddles_enabled = (domain.fast_external_twiddles != nullptr) && !is_on_coset;
S* twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_external_twiddles_inv : domain.fast_external_twiddles)
: domain.twiddles;
S* internal_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_internal_twiddles_inv : domain.fast_internal_twiddles)
: domain.internal_twiddles;
S* basic_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_basic_twiddles_inv : domain.fast_basic_twiddles)
: domain.basic_twiddles;
CHK_IF_RETURN(ntt::mixed_radix_ntt(
d_input, d_output, twiddles, internal_twiddles, basic_twiddles, size, domain.max_log_size, batch_size,
is_inverse, is_fast_twiddles_enabled, config.ordering, coset, coset_index, stream));
}
if (!are_outputs_on_device)
CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, input_size_bytes, cudaMemcpyDeviceToHost, stream));
if (coset) CHK_IF_RETURN(cudaFreeAsync(coset, stream));
if (d_allocated_input) CHK_IF_RETURN(cudaFreeAsync(d_allocated_input, stream));
if (d_allocated_output) CHK_IF_RETURN(cudaFreeAsync(d_allocated_output, stream));
if (!are_inputs_on_device) CHK_IF_RETURN(cudaFreeAsync(d_input, stream));
if (!are_outputs_on_device) CHK_IF_RETURN(cudaFreeAsync(d_output, stream));
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
return CHK_LAST();
@@ -713,7 +685,6 @@ namespace ntt {
ctx, // ctx
S::one(), // coset_gen
1, // batch_size
false, // columns_batch
Ordering::kNN, // ordering
false, // are_inputs_on_device
false, // are_outputs_on_device
@@ -741,7 +712,7 @@ namespace ntt {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, NTTCuda)(
const curve_config::scalar_t* input,
curve_config::scalar_t* input,
int size,
NTTDir dir,
NTTConfig<curve_config::scalar_t>& config,
@@ -750,18 +721,8 @@ namespace ntt {
return NTT<curve_config::scalar_t, curve_config::scalar_t>(input, size, dir, config, output);
}
/**
* Extern "C" version of [ReleaseDomain](@ref ReleaseDomain) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [scalar field](@ref scalar_t) of the curve;
* This wrapper only forwards `ctx` to `ReleaseDomain<curve_config::scalar_t>` and returns its result unchanged.
* @param ctx Device context passed through to `ReleaseDomain`.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ReleaseDomain)(device_context::DeviceContext& ctx)
{
return ReleaseDomain<curve_config::scalar_t>(ctx);
}
#if defined(ECNTT_DEFINED)
/**
* Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
@@ -770,7 +731,7 @@ namespace ntt {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ECNTTCuda)(
const curve_config::projective_t* input,
curve_config::projective_t* input,
int size,
NTTDir dir,
NTTConfig<curve_config::scalar_t>& config,

View File

@@ -40,22 +40,6 @@ namespace ntt {
template <typename S>
cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode = false);
/**
* Releases and deallocates resources associated with the domain initialized for performing NTTs.
* This function should be called to clean up resources once they are no longer needed.
* It's important to note that after calling this function, any operation that relies on the released domain will
* fail unless InitDomain is called again to reinitialize the resources. Therefore, ensure that ReleaseDomain is
* only called when the operations requiring the NTT domain are completely finished and the domain is no longer
* needed.
* Also note that it releases only the domain associated with the device specified by `ctx`.
* @param ctx Details related to the device context such as its id and stream id.
* @return `cudaSuccess` if the resource release was successful, indicating that the domain and its associated
* resources have been properly deallocated. Returns an error code otherwise, indicating failure to release
* the resources. The error code can be used to diagnose the problem.
* */
template <typename S>
cudaError_t ReleaseDomain(device_context::DeviceContext& ctx);
/**
* @enum NTTDir
* Whether to perform normal forward NTT, or inverse NTT (iNTT). Mathematically, forward NTT computes polynomial
@@ -111,8 +95,6 @@ namespace ntt {
S coset_gen; /**< Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()`
* (corresponding to no coset being used). */
int batch_size; /**< The number of NTTs to compute. Default value: 1. */
bool columns_batch; /**< True if the batches are the columns of an input matrix
(they are strided in memory with a stride of ntt size) Default value: false. */
Ordering ordering; /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value:
* `Ordering::kNN`. */
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
@@ -150,7 +132,7 @@ namespace ntt {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename S, typename E>
cudaError_t NTT(const E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output);
cudaError_t NTT(E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output);
} // namespace ntt

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#include "appUtils/ntt/ntt.cuh" // for enum Ordering
namespace mxntt {
namespace ntt {
template <typename S>
cudaError_t generate_external_twiddles_generic(
@@ -27,7 +27,7 @@ namespace mxntt {
template <typename E, typename S>
cudaError_t mixed_radix_ntt(
const E* d_input,
E* d_input,
E* d_output,
S* external_twiddles,
S* internal_twiddles,
@@ -35,13 +35,12 @@ namespace mxntt {
int ntt_size,
int max_logn,
int batch_size,
bool columns_batch,
bool is_inverse,
bool fast_tw,
ntt::Ordering ordering,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream);
} // namespace mxntt
} // namespace ntt
#endif //_NTT_IMPL_H

View File

@@ -12,14 +12,8 @@
#include "ntt/ntt_impl.cuh"
#include <memory>
#ifdef ECNTT_DEFINED
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_data;
#else
typedef curve_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_data;
#endif
#include "kernel_ntt.cu"
void random_samples(test_data* res, uint32_t count)
@@ -35,13 +29,6 @@ void incremental_values(test_scalar* res, uint32_t count)
}
}
// Transposes a matrix from `in` into `out`, one element per thread.
// `in` is read as consecutive rows of length `row_size` (row = tid / row_size, column = tid % row_size);
// each element is written to `out` at index column * column_size + row.
// @param in          source buffer holding row_size * column_size elements
// @param out         destination buffer of the same element count
// @param row_size    length of one row of `in`
// @param column_size number of rows of `in` (length of one row of `out`)
__global__ void transpose_batch(test_scalar* in, test_scalar* out, int row_size, int column_size)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
// Threads whose index falls past the last element do nothing (the grid size is rounded up at the call sites).
if (tid >= row_size * column_size) return;
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
}
int main(int argc, char** argv)
{
cudaEvent_t icicle_start, icicle_stop, new_start, new_stop;
@@ -50,12 +37,11 @@ int main(int argc, char** argv)
int NTT_LOG_SIZE = (argc > 1) ? atoi(argv[1]) : 19;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
bool INPLACE = (argc > 2) ? atoi(argv[2]) : false;
int INV = (argc > 3) ? atoi(argv[3]) : false;
int BATCH_SIZE = (argc > 4) ? atoi(argv[4]) : 150;
bool COLUMNS_BATCH = (argc > 5) ? atoi(argv[5]) : false;
int COSET_IDX = (argc > 6) ? atoi(argv[6]) : 2;
const ntt::Ordering ordering = (argc > 7) ? ntt::Ordering(atoi(argv[7])) : ntt::Ordering::kNN;
bool FAST_TW = (argc > 8) ? atoi(argv[8]) : true;
int INV = (argc > 3) ? atoi(argv[3]) : true;
int BATCH_SIZE = (argc > 4) ? atoi(argv[4]) : 1;
int COSET_IDX = (argc > 5) ? atoi(argv[5]) : 0;
const ntt::Ordering ordering = (argc > 6) ? ntt::Ordering(atoi(argv[6])) : ntt::Ordering::kNN;
bool FAST_TW = (argc > 7) ? atoi(argv[7]) : true;
// Note: NM, MN are not expected to be equal when comparing mixed-radix and radix-2 NTTs
const char* ordering_str = ordering == ntt::Ordering::kNN ? "NN"
@@ -66,8 +52,8 @@ int main(int argc, char** argv)
: "MN";
printf(
"running ntt 2^%d, inplace=%d, inverse=%d, batch_size=%d, columns_batch=%d coset-idx=%d, ordering=%s, fast_tw=%d\n",
NTT_LOG_SIZE, INPLACE, INV, BATCH_SIZE, COLUMNS_BATCH, COSET_IDX, ordering_str, FAST_TW);
"running ntt 2^%d, inplace=%d, inverse=%d, batch_size=%d, coset-idx=%d, ordering=%s, fast_tw=%d\n", NTT_LOG_SIZE,
INPLACE, INV, BATCH_SIZE, COSET_IDX, ordering_str, FAST_TW);
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context (warmup)
@@ -77,7 +63,6 @@ int main(int argc, char** argv)
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.batch_size = BATCH_SIZE;
ntt_config.columns_batch = COLUMNS_BATCH;
CHK_IF_RETURN(cudaEventCreate(&icicle_start));
CHK_IF_RETURN(cudaEventCreate(&icicle_stop));
@@ -98,9 +83,7 @@ int main(int argc, char** argv)
// gpu allocation
test_data *GpuScalars, *GpuOutputOld, *GpuOutputNew;
test_data* GpuScalarsTransposed;
CHK_IF_RETURN(cudaMalloc(&GpuScalars, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuScalarsTransposed, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputOld, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputNew, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
@@ -110,16 +93,10 @@ int main(int argc, char** argv)
CHK_IF_RETURN(
cudaMemcpy(GpuScalars, CpuScalars.get(), NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyHostToDevice));
if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuScalars, GpuScalarsTransposed, NTT_SIZE, BATCH_SIZE);
}
// inplace
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
CHK_IF_RETURN(
cudaMemcpy(GpuOutputNew, GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
for (int coset_idx = 0; coset_idx < COSET_IDX; ++coset_idx) {
@@ -132,14 +109,13 @@ int main(int argc, char** argv)
ntt_config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(ntt::NTT(
INPLACE ? GpuOutputNew
: COLUMNS_BATCH ? GpuScalarsTransposed
: GpuScalars,
NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputNew));
INPLACE ? GpuOutputNew : GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config,
GpuOutputNew));
}
CHK_IF_RETURN(cudaEventRecord(new_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&new_time, new_start, new_stop));
if (is_print) { fprintf(stderr, "cuda err %d\n", cudaGetLastError()); }
// OLD
CHK_IF_RETURN(cudaEventRecord(icicle_start, ntt_config.ctx.stream));
@@ -151,6 +127,7 @@ int main(int argc, char** argv)
CHK_IF_RETURN(cudaEventRecord(icicle_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&icicle_time, icicle_start, icicle_stop));
if (is_print) { fprintf(stderr, "cuda err %d\n", cudaGetLastError()); }
if (is_print) {
printf("Old Runtime=%0.3f MS\n", icicle_time / iterations);
@@ -163,19 +140,11 @@ int main(int argc, char** argv)
CHK_IF_RETURN(benchmark(false /*=print*/, 1)); // warmup
int count = INPLACE ? 1 : 10;
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
CHK_IF_RETURN(
cudaMemcpy(GpuOutputNew, GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
CHK_IF_RETURN(benchmark(true /*=print*/, count));
if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuOutputNew, GpuScalarsTransposed, BATCH_SIZE, NTT_SIZE);
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, GpuScalarsTransposed, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
// verify
CHK_IF_RETURN(
cudaMemcpy(CpuOutputNew.get(), GpuOutputNew, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
@@ -184,11 +153,10 @@ int main(int argc, char** argv)
bool success = true;
for (int i = 0; i < NTT_SIZE * BATCH_SIZE; i++) {
// if (i%64==0) printf("\n");
if (CpuOutputNew[i] != CpuOutputOld[i]) {
success = false;
// std::cout << i << " ref " << CpuOutputOld[i] << " != " << CpuOutputNew[i] << std::endl;
// break;
break;
} else {
// std::cout << i << " ref " << CpuOutputOld[i] << " == " << CpuOutputNew[i] << std::endl;
// break;
@@ -201,7 +169,5 @@ int main(int argc, char** argv)
CHK_IF_RETURN(cudaFree(GpuOutputOld));
CHK_IF_RETURN(cudaFree(GpuOutputNew));
ntt::ReleaseDomain<test_scalar>(ntt_config.ctx);
return CHK_LAST();
}

View File

@@ -9,7 +9,6 @@
// Indexing metadata passed by value to the twiddle and global-memory load/store helpers.
// The field notes below describe how the helpers visible in this file use each value.
struct stage_metadata {
// Multiplier applied to the per-element step in loadGlobalData/storeGlobalData (th_stride * i * data_stride).
uint32_t th_stride;
// Scales ntt_block_id when computing the base offset into the data buffer.
uint32_t ntt_block_size;
// Added to the base offset by the *ColumnBatch load/store helpers to select the batch column.
uint32_t batch_id;
// Block index; masked with (stride - 1) and shifted by the log-stride in the offset computations.
uint32_t ntt_block_id;
// Input index within the block; scaled by the stride (and by 2 or 4 in the 32/16 variants).
uint32_t ntt_inp_id;
};
@@ -51,113 +50,116 @@ public:
S WI[7];
S WE[8];
DEVICE_INLINE void loadBasicTwiddles(S* basic_twiddles)
__device__ __forceinline__ void loadBasicTwiddles(S* basic_twiddles)
{
UNROLL
#pragma unroll
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[i];
}
}
DEVICE_INLINE void loadBasicTwiddlesGeneric(S* basic_twiddles, bool inv)
__device__ __forceinline__ void loadBasicTwiddlesGeneric(S* basic_twiddles, bool inv)
{
UNROLL
#pragma unroll
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[inv ? i + 3 : i];
}
}
DEVICE_INLINE void loadInternalTwiddles64(S* data, bool stride)
__device__ __forceinline__ void loadInternalTwiddles64(S* data, bool stride)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
WI[i] = data[((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddles32(S* data, bool stride)
__device__ __forceinline__ void loadInternalTwiddles32(S* data, bool stride)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
WI[i] = data[2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddles16(S* data, bool stride)
__device__ __forceinline__ void loadInternalTwiddles16(S* data, bool stride)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
WI[i] = data[4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric64(S* data, bool stride, bool inv)
__device__ __forceinline__ void loadInternalTwiddlesGeneric64(S* data, bool stride, bool inv)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
uint32_t exp = ((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp]; // if exp = 0 we also take exp and not 64-exp
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric32(S* data, bool stride, bool inv)
__device__ __forceinline__ void loadInternalTwiddlesGeneric32(S* data, bool stride, bool inv)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
uint32_t exp = 2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric16(S* data, bool stride, bool inv)
__device__ __forceinline__ void loadInternalTwiddlesGeneric16(S* data, bool stride, bool inv)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
uint32_t exp = 4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
DEVICE_INLINE void loadExternalTwiddles64(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
__device__ __forceinline__ void
loadExternalTwiddles64(S* data, uint32_t tw_order, uint32_t tw_log_order, bool strided, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
WE[i] = data[8 * i * tw_order + (1 << tw_log_order + 6) - 1];
}
}
DEVICE_INLINE void loadExternalTwiddles32(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
__device__ __forceinline__ void
loadExternalTwiddles32(S* data, uint32_t tw_order, uint32_t tw_log_order, bool strided, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 2 + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
WE[4 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 5) - 1];
}
}
}
DEVICE_INLINE void loadExternalTwiddles16(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
__device__ __forceinline__ void
loadExternalTwiddles16(S* data, uint32_t tw_order, uint32_t tw_log_order, bool strided, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 4 + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
WE[2 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 4) - 1];
}
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric64(
__device__ __forceinline__ void loadExternalTwiddlesGeneric64(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
uint32_t exp = (s_meta.ntt_inp_id + 8 * i) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 6);
@@ -165,12 +167,12 @@ public:
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric32(
__device__ __forceinline__ void loadExternalTwiddlesGeneric32(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 2 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 5);
@@ -179,12 +181,12 @@ public:
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric16(
__device__ __forceinline__ void loadExternalTwiddlesGeneric16(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 4 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 4);
@@ -193,8 +195,8 @@ public:
}
}
DEVICE_INLINE void
loadGlobalData(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void loadGlobalData(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
@@ -203,28 +205,14 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride];
}
}
// Loads X[0..7] from `data` for the column-batch layout.
// The element offset is built from the block and input ids exactly as in the strided path, then multiplied by
// `batch_size` and shifted by `s_meta.batch_id`, so successive elements of one NTT sit `batch_size` entries apart.
// @param data            base pointer of the input buffer (read-only)
// @param data_stride     stride used for the offset; the mask (data_stride - 1) assumes a power of two — TODO confirm
// @param log_data_stride shift amount applied to ntt_block_id; presumably log2(data_stride) — verify against caller
// @param s_meta          indexing metadata (block id, input id, block size, thread stride, batch id)
// @param batch_size      number of interleaved batches; scales every index
DEVICE_INLINE void loadGlobalDataColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
// Base offset of the first element, in units of elements of E.
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride * batch_size];
}
}
DEVICE_INLINE void
storeGlobalData(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void storeGlobalData(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
@@ -233,28 +221,14 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride] = X[i];
}
}
// Stores X[0..7] to `data` for the column-batch layout; the write-side mirror of loadGlobalDataColumnBatch.
// The element offset is multiplied by `batch_size` and shifted by `s_meta.batch_id`, so successive elements of one
// NTT are written `batch_size` entries apart.
// @param data            base pointer of the output buffer
// @param data_stride     stride used for the offset; the mask (data_stride - 1) assumes a power of two — TODO confirm
// @param log_data_stride shift amount applied to ntt_block_id; presumably log2(data_stride) — verify against caller
// @param s_meta          indexing metadata (block id, input id, block size, thread stride, batch id)
// @param batch_size      number of interleaved batches; scales every index
DEVICE_INLINE void storeGlobalDataColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
// Base offset of the first element, in units of elements of E.
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride * batch_size] = X[i];
}
}
DEVICE_INLINE void
loadGlobalData32(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void loadGlobalData32(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
@@ -263,34 +237,17 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
// Loads X[0..7] from `data` for the column-batch layout, 32-point variant.
// Differs from loadGlobalDataColumnBatch in that ntt_inp_id is scaled by 2 and the eight values are gathered as
// 2 groups (j) of 4 (i): X[4 * j + i] comes from index (8 * i + j) * data_stride * batch_size.
// @param data            base pointer of the input buffer (read-only)
// @param data_stride     stride used for the offset; the mask (data_stride - 1) assumes a power of two — TODO confirm
// @param log_data_stride shift amount applied to ntt_block_id; presumably log2(data_stride) — verify against caller
// @param s_meta          indexing metadata (block id, input id, block size, batch id)
// @param batch_size      number of interleaved batches; scales every index
DEVICE_INLINE void loadGlobalData32ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
// Base offset of the first element, in units of elements of E.
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}
DEVICE_INLINE void
storeGlobalData32(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void storeGlobalData32(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
@@ -299,34 +256,17 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride] = X[4 * j + i];
}
}
}
// Stores X[0..7] to `data` for the column-batch layout, 32-point variant; the write-side mirror of
// loadGlobalData32ColumnBatch. ntt_inp_id is scaled by 2 and X[4 * j + i] is written to index
// (8 * i + j) * data_stride * batch_size for j in [0, 2) and i in [0, 4).
// @param data            base pointer of the output buffer
// @param data_stride     stride used for the offset; the mask (data_stride - 1) assumes a power of two — TODO confirm
// @param log_data_stride shift amount applied to ntt_block_id; presumably log2(data_stride) — verify against caller
// @param s_meta          indexing metadata (block id, input id, block size, batch id)
// @param batch_size      number of interleaved batches; scales every index
DEVICE_INLINE void storeGlobalData32ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
// Base offset of the first element, in units of elements of E.
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[4 * j + i];
}
}
}
DEVICE_INLINE void
loadGlobalData16(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void loadGlobalData16(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
@@ -335,34 +275,17 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
// Loads X[0..7] from `data` for the column-batch layout, 16-point variant.
// ntt_inp_id is scaled by 4 and the eight values are gathered as 4 groups (j) of 2 (i):
// X[2 * j + i] comes from index (8 * i + j) * data_stride * batch_size.
// @param data            base pointer of the input buffer (read-only)
// @param data_stride     stride used for the offset; the mask (data_stride - 1) assumes a power of two — TODO confirm
// @param log_data_stride shift amount applied to ntt_block_id; presumably log2(data_stride) — verify against caller
// @param s_meta          indexing metadata (block id, input id, block size, batch id)
// @param batch_size      number of interleaved batches; scales every index
DEVICE_INLINE void loadGlobalData16ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
// Base offset of the first element, in units of elements of E.
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}
DEVICE_INLINE void
storeGlobalData16(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void storeGlobalData16(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
@@ -371,49 +294,32 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride] = X[2 * j + i];
}
}
}
DEVICE_INLINE void storeGlobalData16ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
__device__ __forceinline__ void ntt4_2()
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[2 * j + i];
}
}
}
DEVICE_INLINE void ntt4_2()
{
UNROLL
#pragma unroll
for (int i = 0; i < 2; i++) {
ntt4(X[4 * i], X[4 * i + 1], X[4 * i + 2], X[4 * i + 3]);
}
}
DEVICE_INLINE void ntt2_4()
__device__ __forceinline__ void ntt2_4()
{
UNROLL
#pragma unroll
for (int i = 0; i < 4; i++) {
ntt2(X[2 * i], X[2 * i + 1]);
}
}
DEVICE_INLINE void ntt2(E& X0, E& X1)
__device__ __forceinline__ void ntt2(E& X0, E& X1)
{
E T;
@@ -422,7 +328,7 @@ public:
X0 = T;
}
DEVICE_INLINE void ntt4(E& X0, E& X1, E& X2, E& X3)
__device__ __forceinline__ void ntt4(E& X0, E& X1, E& X2, E& X3)
{
E T;
@@ -440,7 +346,7 @@ public:
}
// rbo version
DEVICE_INLINE void ntt4rbo(E& X0, E& X1, E& X2, E& X3)
__device__ __forceinline__ void ntt4rbo(E& X0, E& X1, E& X2, E& X3)
{
E T;
@@ -457,7 +363,7 @@ public:
X3 = T - X3;
}
DEVICE_INLINE void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
__device__ __forceinline__ void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
{
E T;
@@ -497,7 +403,7 @@ public:
X4 = X4 - T;
}
DEVICE_INLINE void ntt8win()
__device__ __forceinline__ void ntt8win()
{
E T;
@@ -539,12 +445,12 @@ public:
X[4] = X[4] - T;
}
DEVICE_INLINE void SharedData64Columns8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData64Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t column_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + i * 8 + column_id] = X[i];
@@ -554,12 +460,12 @@ public:
}
}
DEVICE_INLINE void SharedData64Rows8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData64Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t row_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + row_id * 8 + i] = X[i];
@@ -569,12 +475,12 @@ public:
}
}
DEVICE_INLINE void SharedData32Columns8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData32Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + i * 4 + column_id] = X[i];
@@ -584,12 +490,12 @@ public:
}
}
DEVICE_INLINE void SharedData32Rows8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData32Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 8 + i] = X[i];
@@ -599,14 +505,14 @@ public:
}
}
DEVICE_INLINE void SharedData32Columns4_2(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData32Columns4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + i * 8 + column_id + j] = X[4 * j + i];
@@ -617,14 +523,14 @@ public:
}
}
DEVICE_INLINE void SharedData32Rows4_2(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData32Rows4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 4 + 4 * j + i] = X[4 * j + i];
@@ -635,12 +541,12 @@ public:
}
}
DEVICE_INLINE void SharedData16Columns8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData16Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + i * 2 + column_id] = X[i];
@@ -650,12 +556,12 @@ public:
}
}
DEVICE_INLINE void SharedData16Rows8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData16Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 8 + i] = X[i];
@@ -665,14 +571,14 @@ public:
}
}
DEVICE_INLINE void SharedData16Columns2_4(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData16Columns2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + i * 8 + column_id + j] = X[2 * j + i];
@@ -683,14 +589,14 @@ public:
}
}
DEVICE_INLINE void SharedData16Rows2_4(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData16Rows2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 2 + 2 * j + i] = X[2 * j + i];
@@ -701,17 +607,17 @@ public:
}
}
DEVICE_INLINE void twiddlesInternal()
__device__ __forceinline__ void twiddlesInternal()
{
UNROLL
#pragma unroll
for (int i = 1; i < 8; i++) {
X[i] = X[i] * WI[i - 1];
}
}
DEVICE_INLINE void twiddlesExternal()
__device__ __forceinline__ void twiddlesExternal()
{
UNROLL
#pragma unroll
for (int i = 0; i < 8; i++) {
X[i] = X[i] * WE[i];
}

View File

@@ -1,39 +0,0 @@
#pragma once
#ifndef CURVE_POSEIDON_H
#define CURVE_POSEIDON_H
namespace poseidon_constants_curve {
/**
* Poseidon constants for a single curve.
*
* The constants are expected to be generated with the algorithm defined at
* https://spec.filecoin.io/algorithms/crypto/poseidon/
* The number in each name corresponds to the arity of the hash function.
* Each array is laid out as:
* RoundConstants | MDSMatrix | Non-sparse matrix | Sparse matrices
*
* NOTE(review): every value below is a placeholder (partial round counts of 0 and
* arrays holding a single 0x00 byte) -- presumably this file is a template to be
* filled in per curve; confirm.
*/
int partial_rounds_2 = 0;
int partial_rounds_4 = 0;
int partial_rounds_8 = 0;
int partial_rounds_11 = 0;
unsigned char poseidon_constants_2[] = {
0x00
};
unsigned char poseidon_constants_4[] = {
0x00
};
unsigned char poseidon_constants_8[] = {
0x00
};
unsigned char poseidon_constants_11[] = {
0x00
};
} // namespace poseidon_constants_curve
#endif

View File

@@ -31,7 +31,7 @@ namespace poseidon {
}
template <typename S>
DEVICE_INLINE S sbox_alpha_five(S element)
__device__ __forceinline__ S sbox_alpha_five(S element)
{
S result = S::sqr(element);
result = S::sqr(result);
@@ -46,7 +46,7 @@ namespace poseidon {
__syncthreads();
typename S::Wide element_wide = S::mul_wide(shared_states[vec_number * T], matrix[element_number]);
UNROLL
#pragma unroll
for (int i = 1; i < T; i++) {
element_wide = element_wide + S::mul_wide(shared_states[vec_number * T + i], matrix[i * T + element_number]);
}
@@ -117,14 +117,14 @@ namespace poseidon {
typename S::Wide state_0_wide = S::mul_wide(element, sparse_matrix[0]);
UNROLL
#pragma unroll
for (int i = 1; i < T; i++) {
state_0_wide = state_0_wide + S::mul_wide(state[i], sparse_matrix[i]);
}
state[0] = S::reduce(state_0_wide);
UNROLL
#pragma unroll
for (int i = 1; i < T; i++) {
state[i] = state[i] + (element * sparse_matrix[T + i - 1]);
}
@@ -138,7 +138,7 @@ namespace poseidon {
if (idx >= number_of_states) { return; }
S state[T];
UNROLL
#pragma unroll
for (int i = 0; i < T; i++) {
state[i] = states[idx * T + i];
}
@@ -148,7 +148,7 @@ namespace poseidon {
rc_offset++;
}
UNROLL
#pragma unroll
for (int i = 0; i < T; i++) {
states[idx * T + i] = state[i];
}

View File

@@ -20,7 +20,7 @@ namespace poseidon {
/**
* For most of the Poseidon configurations this is the case
* TODO: Add support for different full rounds numbers
* To-do: Add support for different full rounds numbers
*/
const int FULL_ROUNDS_DEFAULT = 4;

View File

@@ -1,8 +1,8 @@
// #define DEBUG
#define CURVE_ID 2
#include "curves/curve_config.cuh"
#include "utils/device_context.cuh"
#include "../../curves/curve_config.cuh"
#include "../../utils/device_context.cuh"
#include "poseidon.cu"
#ifndef __CUDA_ARCH__

View File

@@ -2,8 +2,8 @@
#define MERKLE_DEBUG
#define CURVE_ID 2
#include "curves/curve_config.cuh"
#include "appUtils/poseidon/poseidon.cu"
#include "../../curves/curve_config.cuh"
#include "../poseidon/poseidon.cu"
#include "merkle.cu"
#ifndef __CUDA_ARCH__

View File

@@ -1,11 +0,0 @@
// Helper macros controlling forced inlining and loop unrolling.
// In DEVMODE / DEBUG builds both INLINE_MACRO and UNROLL expand to nothing;
// otherwise functions are force-inlined and loops marked with UNROLL are unrolled.
#if defined(DEVMODE) || defined(DEBUG)
#define INLINE_MACRO
#define UNROLL
#else
#define INLINE_MACRO __forceinline__
// A `#pragma` directive cannot be produced by macro expansion (the expanded tokens
// are never re-read as a directive), so the `_Pragma` operator is used instead.
#define UNROLL _Pragma("unroll")
#endif
// Execution-space qualifiers combined with the inlining policy selected above.
#define HOST_INLINE __host__ INLINE_MACRO
#define DEVICE_INLINE __device__ INLINE_MACRO
#define HOST_DEVICE_INLINE __host__ __device__ INLINE_MACRO

View File

@@ -2,7 +2,7 @@
#ifndef BLS12_377_PARAMS_H
#define BLS12_377_PARAMS_H
#include "utils/storage.cuh"
#include "../utils/storage.cuh"
namespace bls12_377 {
struct fp_config {

View File

@@ -2,7 +2,7 @@
#ifndef BLS12_381_PARAMS_H
#define BLS12_381_PARAMS_H
#include "utils/storage.cuh"
#include "../utils/storage.cuh"
namespace bls12_381 {
struct fp_config {

View File

@@ -2,7 +2,7 @@
#ifndef BN254_PARAMS_H
#define BN254_PARAMS_H
#include "utils/storage.cuh"
#include "../utils/storage.cuh"
namespace bn254 {
struct fp_config {

View File

@@ -2,12 +2,9 @@
#ifndef BW6_761_PARAMS_H
#define BW6_761_PARAMS_H
#include "utils/storage.cuh"
#include "bls12_377_params.cuh"
#include "../utils/storage.cuh"
namespace bw6_761 {
typedef bls12_377::fq_config fp_config;
struct fq_config {
static constexpr unsigned limbs_count = 24;
static constexpr unsigned modulus_bit_count = 761;

View File

@@ -8,10 +8,10 @@
#define BW6_761 4
#define GRUMPKIN 5
#include "primitives/field.cuh"
#include "primitives/projective.cuh"
#include "../primitives/field.cuh"
#include "../primitives/projective.cuh"
#if defined(G2_DEFINED)
#include "primitives/extension_field.cuh"
#include "../primitives/extension_field.cuh"
#endif
#if CURVE_ID == BN254
@@ -24,6 +24,7 @@ using namespace bls12_381;
#include "bls12_377_params.cuh"
using namespace bls12_377;
#elif CURVE_ID == BW6_761
#include "bls12_377_params.cuh"
#include "bw6_761_params.cuh"
using namespace bw6_761;
#elif CURVE_ID == GRUMPKIN
@@ -38,6 +39,10 @@ using namespace grumpkin;
* with the `-DCURVE` env variable passed during build.
*/
namespace curve_config {
#if CURVE_ID == BW6_761
typedef bls12_377::fq_config fp_config;
#endif
/**
* Scalar field of the curve. Is always a prime field.
*/

View File

@@ -2,7 +2,7 @@
#ifndef GRUMPKIN_PARAMS_H
#define GRUMPKIN_PARAMS_H
#include "utils/storage.cuh"
#include "../utils/storage.cuh"
#include "bn254_params.cuh"
namespace grumpkin {

View File

@@ -33,4 +33,4 @@ public:
os << "x: " << point.x << "; y: " << point.y;
return os;
}
};
};

View File

@@ -1,7 +1,10 @@
#pragma once
#include "field.cuh"
#include "common.cuh"
#define HOST_INLINE __host__ __forceinline__
#define DEVICE_INLINE __device__ __forceinline__
#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
template <typename CONFIG>
class ExtensionField

View File

@@ -2,7 +2,7 @@
#include "field.cuh"
#include "utils/utils.h"
using namespace curve_config;
#define scalar_t curve_config::scalar_t
extern "C" void CONCAT_EXPAND(CURVE, GenerateScalars)(scalar_t* scalars, int size)
{

View File

@@ -18,16 +18,20 @@
#pragma once
#include "utils/error_handler.cuh"
#include "utils/host_math.cuh"
#include "utils/ptx.cuh"
#include "utils/storage.cuh"
#include "../utils/error_handler.cuh"
#include "../utils/host_math.cuh"
#include "../utils/ptx.cuh"
#include "../utils/storage.cuh"
#include <iomanip>
#include <iostream>
#include <random>
#include <sstream>
#include <string>
#define HOST_INLINE __host__ __forceinline__
#define DEVICE_INLINE __device__ __forceinline__
#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
template <class CONFIG>
class Field
{
@@ -126,7 +130,7 @@ public:
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#pragma unroll
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
@@ -137,7 +141,7 @@ public:
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#pragma unroll
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
@@ -148,7 +152,7 @@ public:
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#pragma unroll
#endif
for (unsigned i = 0; i < TLC; i++) {
#ifdef __CUDA_ARCH__
@@ -240,7 +244,7 @@ public:
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
static constexpr __device__ __forceinline__ uint32_t
add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1))
{
r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
@@ -323,7 +327,7 @@ public:
static DEVICE_INLINE void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
UNROLL
#pragma unroll
for (size_t i = 0; i < n; i += 2) {
acc[i] = ptx::mul_lo(a[i], bi);
acc[i + 1] = ptx::mul_hi(a[i], bi);
@@ -332,7 +336,7 @@ public:
static DEVICE_INLINE void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
{
UNROLL
#pragma unroll
for (size_t i = start_i; i < n; i += 2) {
acc[i] = ptx::mul_lo(a[i], bi);
acc[i + 1] = ptx::mul_hi(a[i], bi);
@@ -340,14 +344,14 @@ public:
}
template <bool CARRY_IN = false>
static DEVICE_INLINE void
static __device__ __forceinline__ void
cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, uint32_t optional_carry = 0)
{
if (CARRY_IN) ptx::add_cc(UINT32_MAX, optional_carry);
acc[0] = CARRY_IN ? ptx::madc_lo_cc(a[0], bi, acc[0]) : ptx::mad_lo_cc(a[0], bi, acc[0]);
acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);
UNROLL
#pragma unroll
for (size_t i = 2; i < n; i += 2) {
acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
@@ -355,7 +359,7 @@ public:
}
template <bool EVEN_PHASE>
static DEVICE_INLINE void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static __device__ __forceinline__ void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
if (EVEN_PHASE) {
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -364,14 +368,14 @@ public:
acc[1] = ptx::mad_hi_cc(a[0], bi, acc[1]);
}
UNROLL
#pragma unroll
for (size_t i = 2; i < n; i += 2) {
acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
}
}
static DEVICE_INLINE void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static __device__ __forceinline__ void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
if (n > 1)
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -379,7 +383,7 @@ public:
acc[0] = ptx::mad_lo(a[0], bi, acc[0]);
size_t i;
UNROLL
#pragma unroll
for (i = 1; i < n - 1; i += 2) {
acc[i] = ptx::madc_hi_cc(a[i - 1], bi, acc[i]);
if (i == n - 2)
@@ -391,7 +395,7 @@ public:
}
template <bool CARRY_OUT = false, bool CARRY_IN = false>
static DEVICE_INLINE uint32_t mad_row(
static __device__ __forceinline__ uint32_t mad_row(
uint32_t* odd,
uint32_t* even,
const uint32_t* a,
@@ -416,7 +420,8 @@ public:
}
template <bool EVEN_PHASE>
static DEVICE_INLINE void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
static __device__ __forceinline__ void
mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
cmad_n_msb<!EVEN_PHASE>(odd, EVEN_PHASE ? a : (a + 1), bi, n - 2);
odd[EVEN_PHASE ? (n - 1) : (n - 2)] = ptx::madc_lo_cc(a[n - 1], bi, 0);
@@ -425,7 +430,8 @@ public:
odd[EVEN_PHASE ? n : (n - 1)] = ptx::addc(odd[EVEN_PHASE ? n : (n - 1)], 0);
}
static DEVICE_INLINE void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
static __device__ __forceinline__ void
mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
// bi here is constant so we can do a compile-time check for zero (which does happen once for bls12-381 scalar field
// modulus)
@@ -436,12 +442,12 @@ public:
return;
}
static DEVICE_INLINE uint32_t
static __device__ __forceinline__ uint32_t
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
{
acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);
UNROLL
#pragma unroll
for (size_t i = 1; i < n - 1; i += 2) {
acc[i] = ptx::madc_hi_cc(a[i - 1], bi, extra[i]);
acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, extra[i + 1]);
@@ -464,7 +470,8 @@ public:
* \cdot b_0}{2^{32}}} + \dots + \floor{\frac{a_0 \cdot b_{TLC - 2}}{2^{32}}}) \leq 2^{64} + 2\cdot 2^{96} + \dots +
* (TLC - 2) \cdot 2^{32(TLC - 1)} + (TLC - 1) \cdot 2^{32(TLC - 1)} \leq 2(TLC - 1) \cdot 2^{32(TLC - 1)}\f$.
*/
static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static __device__ __forceinline__ void
multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
@@ -475,7 +482,7 @@ public:
odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
size_t i;
UNROLL
#pragma unroll
for (i = 2; i < TLC - 1; i += 2) {
mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
@@ -496,7 +503,7 @@ public:
* is excluded if \f$ i + j > TLC - 1 \f$ and only the lower half is included if \f$ i + j = TLC - 1 \f$. All other
* limb products are included.
*/
static DEVICE_INLINE void
static __device__ __forceinline__ void
multiply_and_add_lsb_raw_device(const ff_storage& as, const ff_storage& bs, ff_storage& cs, ff_storage& rs)
{
const uint32_t* a = as.limbs;
@@ -516,7 +523,7 @@ public:
mul_n(odd, a + 1, b[0], TLC - 1);
}
mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
UNROLL
#pragma unroll
for (i = 2; i < TLC - 1; i += 2) {
mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
@@ -538,7 +545,7 @@ public:
* that the top bit of \f$ a_{hi} \f$ and \f$ b_{hi} \f$ are unset. This ensures correctness by allowing to keep the
* result inside TLC limbs and ignore the carries from the highest limb.
*/
static DEVICE_INLINE void
static __device__ __forceinline__ void
multiply_and_add_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even, uint32_t* in1, uint32_t* in2)
{
__align__(16) uint32_t odd[TLC - 2];
@@ -546,7 +553,7 @@ public:
uint32_t carry = mul_n_and_add(odd, a + 1, b[0], &in2[1]);
size_t i;
UNROLL
#pragma unroll
for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
carry = mad_row<true, false>(
&even[i], &odd[i - 2], a, b[i - 1], TLC >> 1, in1[(TLC >> 1) + i - 2], in1[(TLC >> 1) + i - 1], carry);
@@ -567,7 +574,7 @@ public:
* This method multiplies `a` and `b` and writes the result into `even`. It assumes that `a` and `b` are TLC/2 limbs
* long. The usual schoolbook algorithm is used.
*/
static DEVICE_INLINE void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
static __device__ __forceinline__ void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
{
__align__(16) uint32_t odd[TLC - 2];
mul_n(even, a, b[0], TLC >> 1);
@@ -575,7 +582,7 @@ public:
mad_row(&even[2], &odd[0], a, b[1], TLC >> 1);
size_t i;
UNROLL
#pragma unroll
for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
mad_row(&odd[i], &even[i], a, b[i], TLC >> 1);
mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC >> 1);
@@ -673,7 +680,7 @@ public:
HOST_DEVICE_INLINE uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width)
{
const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32;
const uint32_t shift_bits = (digit_num * digit_width) % 32;
@@ -810,7 +817,7 @@ public:
const uint32_t* x = xs.limbs_storage.limbs;
const uint32_t* y = ys.limbs_storage.limbs;
uint32_t limbs_or = x[0] ^ y[0];
UNROLL
#pragma unroll
for (unsigned i = 1; i < TLC; i++)
limbs_or |= x[i] ^ y[i];
return limbs_or == 0;
@@ -829,7 +836,7 @@ public:
Field mul = multiplier;
static bool is_u32 = true;
#ifdef __CUDA_ARCH__
UNROLL
#pragma unroll
#endif
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (mul.limbs_storage.limbs[i] == 0);
@@ -845,7 +852,7 @@ public:
T temp = xs;
bool is_zero = true;
#ifdef __CUDA_ARCH__
UNROLL
#pragma unroll
#endif
for (unsigned i = 0; i < 32; i++) {
if (multiplier & (1 << i)) {
@@ -895,7 +902,7 @@ public:
Field rs = {};
uint32_t* r = rs.limbs_storage.limbs;
#ifdef __CUDA_ARCH__
UNROLL
#pragma unroll
#endif
for (unsigned i = 0; i < TLC - 1; i++) {
#ifdef __CUDA_ARCH__

View File

@@ -3,7 +3,9 @@
#include <cuda.h>
#include "utils/utils.h"
using namespace curve_config;
#define projective_t curve_config::projective_t // TODO: global to avoid lengthy texts
#define affine_t curve_config::affine_t
#define point_field_t curve_config::point_field_t
extern "C" bool CONCAT_EXPAND(CURVE, Eq)(projective_t* point1, projective_t* point2)
{
@@ -31,7 +33,9 @@ extern "C" void CONCAT_EXPAND(CURVE, GenerateAffinePoints)(affine_t* points, int
#if defined(G2_DEFINED)
using namespace curve_config;
#define g2_projective_t curve_config::g2_projective_t
#define g2_affine_t curve_config::g2_affine_t
#define g2_point_field_t curve_config::g2_point_field_t
extern "C" bool CONCAT_EXPAND(CURVE, G2Eq)(g2_projective_t* point1, g2_projective_t* point2)
{

View File

@@ -8,9 +8,6 @@ class Projective
friend Affine<FF>;
public:
static constexpr unsigned SCALAR_FF_NBITS = SCALAR_FF::NBITS;
static constexpr unsigned FF_NBITS = FF::NBITS;
FF x;
FF y;
FF z;
@@ -39,34 +36,6 @@ public:
static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }
/**
* Doubles `point` and returns the result as the projective triple {X3, Y3, Z3}.
* Only field squarings, multiplications, additions and subtractions are used, following
* the numbered steps below; `b3` in the step comments is 3 * B_VALUE (see step 7).
* NOTE(review): the step sequence appears to match the exception-free doubling for
* short Weierstrass curves with a = 0 (Renes-Costello-Batina, Algorithm 9) -- confirm.
*/
static HOST_DEVICE_INLINE Projective dbl(const Projective& point)
{
const FF X = point.x;
const FF Y = point.y;
const FF Z = point.z;
// TODO: Change to efficient dbl once implemented for field.cuh
FF t0 = FF::sqr(Y); // 1. t0 ← Y · Y
FF Z3 = t0 + t0; // 2. Z3 ← t0 + t0
Z3 = Z3 + Z3; // 3. Z3 ← Z3 + Z3
Z3 = Z3 + Z3; // 4. Z3 ← Z3 + Z3
FF t1 = Y * Z; // 5. t1 ← Y · Z
FF t2 = FF::sqr(Z); // 6. t2 ← Z · Z
t2 = FF::template mul_unsigned<3>(FF::template mul_const<B_VALUE>(t2)); // 7. t2 ← b3 · t2
FF X3 = t2 * Z3; // 8. X3 ← t2 · Z3
FF Y3 = t0 + t2; // 9. Y3 ← t0 + t2
Z3 = t1 * Z3; // 10. Z3 ← t1 · Z3
t1 = t2 + t2; // 11. t1 ← t2 + t2
t2 = t1 + t2; // 12. t2 ← t1 + t2
t0 = t0 - t2; // 13. t0 ← t0 − t2
Y3 = t0 * Y3; // 14. Y3 ← t0 · Y3
Y3 = X3 + Y3; // 15. Y3 ← X3 + Y3
t1 = X * Y; // 16. t1 ← X · Y
X3 = t0 * t1; // 17. X3 ← t0 · t1
X3 = X3 + X3; // 18. X3 ← X3 + X3
return {X3, Y3, Z3};
}
friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2)
{
const FF X1 = p1.x; // < 2
@@ -165,7 +134,7 @@ public:
{
Projective res = zero();
#ifdef __CUDA_ARCH__
UNROLL
#pragma unroll
#endif
for (int i = 0; i < SCALAR_FF::NBITS; i++) {
if (i > 0) { res = res + res; }

View File

@@ -1,6 +1,13 @@
#pragma once
#include "curves/curve_config.cuh"
#ifndef G2_DEFINED
#define G2_DEFINED
#include "../curves/curve_config.cuh"
#include "extension_field.cuh"
#include "projective.cuh"
#endif
using namespace curve_config;

View File

@@ -1,4 +1,4 @@
#include "utils/error_handler.cuh" // Include your error handling header file
#include "../utils/error_handler.cuh" // Include your error handling header file
#include <gtest/gtest.h>
__global__ void a_kernel_with_conditional_sticky_error(bool is_failing)

View File

@@ -1,4 +1,4 @@
#include "utils/error_handler.cuh" // Include your error handling header file
#include "../utils/error_handler.cuh" // Include your error handling header file
#include <gtest/gtest.h>
class IcicleErrorTest : public ::testing::Test

View File

@@ -1,4 +1,4 @@
#include "primitives/test_kernels.cuh"
#include "../primitives/test_kernels.cuh"
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <iostream>

View File

@@ -4,7 +4,7 @@
#include <cstdint>
#include <cuda_runtime.h>
#include "common.cuh"
namespace host_math {
// return x + y with uint32_t operands
@@ -67,9 +67,9 @@ namespace host_math {
struct carry_chain {
unsigned index;
constexpr HOST_INLINE carry_chain() : index(0) {}
constexpr __host__ __forceinline__ carry_chain() : index(0) {}
HOST_INLINE uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
__host__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
@@ -82,7 +82,7 @@ namespace host_math {
return host_math::addc(x, y, carry);
}
HOST_INLINE uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
__host__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)

View File

@@ -9,14 +9,14 @@ namespace mont {
#define MAX_THREADS_PER_BLOCK 256
template <typename E, bool is_into>
__global__ void MontgomeryKernel(const E* input, int n, E* output)
__global__ void MontgomeryKernel(E* input, int n, E* output)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { output[tid] = is_into ? E::ToMontgomery(input[tid]) : E::FromMontgomery(input[tid]); }
}
template <typename E, bool is_into>
cudaError_t ConvertMontgomery(const E* d_input, int n, cudaStream_t stream, E* d_output)
cudaError_t ConvertMontgomery(E* d_input, int n, cudaStream_t stream, E* d_output)
{
// Set the grid and block dimensions
int num_threads = MAX_THREADS_PER_BLOCK;
@@ -29,13 +29,13 @@ namespace mont {
} // namespace
template <typename E>
cudaError_t ToMontgomery(const E* d_input, int n, cudaStream_t stream, E* d_output)
cudaError_t ToMontgomery(E* d_input, int n, cudaStream_t stream, E* d_output)
{
return ConvertMontgomery<E, true>(d_input, n, stream, d_output);
}
template <typename E>
cudaError_t FromMontgomery(const E* d_input, int n, cudaStream_t stream, E* d_output)
cudaError_t FromMontgomery(E* d_input, int n, cudaStream_t stream, E* d_output)
{
return ConvertMontgomery<E, false>(d_input, n, stream, d_output);
}

63
icicle/utils/objects.cuh Normal file
View File

@@ -0,0 +1,63 @@
#pragma once
/**
 * Integer residue stored in a plain `int`, with addition and subtraction modulo `F::q`.
 *
 * @tparam F type exposing the modulus as `F::q`.
 */
template <class F>
class Element
{
public:
  int v; // stored value; see the constructor notes for the exact range

  /** Builds the zero element. */
  __device__ __host__ Element() : v(0) {}

  /**
   * Builds an element from `r` reduced with `r % F::q`.
   * `%` keeps the sign of `r`, so a negative `r` yields a negative `v`.
   * NOTE(review): `r == F::q` is stored as `F::q` itself rather than 0 -- this looks
   * like a deliberate sentinel; confirm before relying on it.
   */
  __device__ __host__ Element(int r)
  {
    v = r % F::q;
    if (r == F::q) v = F::q;
  }

  /** Returns `(v + obj.v) % F::q`; neither operand is modified. */
  __device__ __host__ Element operator+(Element const& obj) const
  {
    Element sum;
    sum.v = (v + obj.v) % F::q;
    return sum;
  }

  /** Returns `(v - obj.v) % F::q`, shifted by `F::q` when the remainder is negative. */
  __device__ __host__ Element operator-(Element const& obj) const
  {
    Element diff;
    diff.v = (v - obj.v) % F::q;
    if (diff.v < 0) { diff.v = F::q + diff.v; }
    return diff;
  }
};
/**
 * Integer scalar stored in a plain `int`, with arithmetic modulo `F::q`.
 * Every member is callable from both host and device code.
 *
 * @tparam F type exposing the modulus as `F::q`.
 */
template <class F>
class Scalar
{
public:
  int v; // stored value, produced by `%` reductions against F::q

  /** Builds the zero scalar. */
  __device__ __host__ Scalar() : v(0) {}

  /** Builds a scalar from `r % F::q` (`%` keeps the sign of `r`). */
  __device__ __host__ Scalar(int r) : v(r % F::q) {}

  /** Returns `(v + obj.v) % F::q`. */
  __device__ __host__ Scalar operator+(Scalar const& obj) const
  {
    Scalar sum;
    sum.v = (v + obj.v) % F::q;
    return sum;
  }

  /** Returns the product of two scalars reduced modulo `F::q`. */
  __device__ __host__ Scalar operator*(Scalar const& obj) const
  {
    Scalar prod;
    // Multiply in 64 bits so the intermediate product cannot overflow `int`.
    prod.v = static_cast<int>((static_cast<long long>(v) * obj.v) % F::q);
    return prod;
  }

  /** Scales an `Element<F>` by this scalar, reducing modulo `F::q`. */
  __device__ __host__ Element<F> operator*(Element<F> const& obj) const
  {
    Element<F> prod;
    // Multiply in 64 bits so the intermediate product cannot overflow `int`.
    prod.v = static_cast<int>((static_cast<long long>(v) * obj.v) % F::q);
    return prod;
  }

  /** Returns `(v - obj.v) % F::q`, shifted by `F::q` when the remainder is negative. */
  __device__ __host__ Scalar operator-(Scalar const& obj) const
  {
    Scalar diff;
    diff.v = (v - obj.v) % F::q;
    if (diff.v < 0) { diff.v = F::q + diff.v; }
    return diff;
  }

  /** Orders scalars by their stored integer value. */
  __device__ __host__ bool operator<(Scalar const& obj) const { return v < obj.v; }

  /** Returns the scalar built from 1. */
  __device__ __host__ static Scalar one() { return Scalar(1); }

  /** Returns the scalar built from 0. */
  __device__ __host__ static Scalar zero() { return Scalar(0); }
};

View File

@@ -1,3 +1,4 @@
// TODO: remove this file; everything seems to work without it
// based on https://leimao.github.io/blog/CUDA-Shared-Memory-Templated-Kernel/
// may be outdated, but only worked like that
@@ -58,7 +59,7 @@
#ifndef _SHAREDMEM_H_
#define _SHAREDMEM_H_
#include "curves/curve_config.cuh"
#include "../curves/curve_config.cuh"
/** @brief Wrapper class for templatized dynamic shared memory arrays.
*

View File

@@ -2,7 +2,17 @@
#ifndef UTILS_KERNELS_H
#define UTILS_KERNELS_H
#include "utils_kernels.cuh"
namespace utils_internal {
// TODO: weird linking issue - only works in headers
// template <typename E, typename S>
// __global__ void NormalizeKernel(E* arr, S scalar, unsigned n)
// {
// int tid = blockIdx.x * blockDim.x + threadIdx.x;
// if (tid < n) { arr[tid] = scalar * arr[tid]; }
// }
template <typename E, typename S>
__global__ void NormalizeKernel(E* arr, S scalar, int n)
{
@@ -12,7 +22,7 @@ namespace utils_internal {
template <typename E, typename S>
__global__ void BatchMulKernel(
const E* in_vec,
E* in_vec,
int n_elements,
int batch_size,
S* scalar_vec,

View File

@@ -2,7 +2,7 @@
#include <stdexcept>
#include "vec_ops.cuh"
#include "curves/curve_config.cuh"
#include "../curves/curve_config.cuh"
#include "device_context.cuh"
#include "mont.cuh"
#include "utils/utils.h"
@@ -33,14 +33,6 @@ namespace vec_ops {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; }
}
// Copies each element of `in` to its transposed position in `out`, one thread per
// element: element `tid` of `in` lands at
// `out[(tid % row_size) * column_size + tid / row_size]`.
// Threads whose index is not below `row_size * column_size` do nothing.
template <typename E>
__global__ void transpose_kernel(const E* in, E* out, uint32_t row_size, uint32_t column_size)
{
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid < row_size * column_size) {
    const uint32_t src_row = tid / row_size;
    const uint32_t src_col = tid % row_size;
    out[src_col * column_size + src_row] = in[tid];
  }
}
} // namespace
template <typename E, void (*Kernel)(E*, E*, int, E*)>
@@ -109,48 +101,6 @@ namespace vec_ops {
return VecOp<E, SubKernel>(vec_a, vec_b, n, config, result);
}
/**
* Launches `transpose_kernel` over a buffer of `row_size * column_size` elements, staging
* the data through temporary device buffers when the caller's buffers are not on the device.
* @param mat_in input buffer of `row_size * column_size` elements.
* @param mat_out output buffer of the same size.
* @param row_size forwarded to the kernel as `row_size`.
* @param column_size forwarded to the kernel as `column_size`.
* @param ctx device context; its stream is used for every allocation, copy and the kernel launch.
* @param on_device when true `mat_in` / `mat_out` are handed to the kernel directly; when false
* they are copied to and from temporary device buffers.
* @param is_async when false the stream is synchronized before returning.
* @return the result of the stream synchronization when `is_async` is false, otherwise `CHK_LAST()`.
* NOTE(review): if `CHK_IF_RETURN` returns early on failure (as its name suggests), the
* temporary device buffers allocated below are not freed on that path -- confirm.
*/
template <typename E>
cudaError_t transpose_matrix(
const E* mat_in,
E* mat_out,
uint32_t row_size,
uint32_t column_size,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
int number_of_threads = MAX_THREADS_PER_BLOCK;
// Round up so that every element gets a thread.
int number_of_blocks = (row_size * column_size + number_of_threads - 1) / number_of_threads;
cudaStream_t stream = ctx.stream;
const E* d_mat_in;
E* d_allocated_input = nullptr;
E* d_mat_out;
if (!on_device) {
// Stage the input in a freshly allocated device buffer and allocate a device output buffer.
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_input, row_size * column_size * sizeof(E), ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(
d_allocated_input, mat_in, row_size * column_size * sizeof(E), cudaMemcpyHostToDevice, ctx.stream));
CHK_IF_RETURN(cudaMallocAsync(&d_mat_out, row_size * column_size * sizeof(E), ctx.stream));
d_mat_in = d_allocated_input;
} else {
// Caller's pointers are used as-is.
d_mat_in = mat_in;
d_mat_out = mat_out;
}
transpose_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(d_mat_in, d_mat_out, row_size, column_size);
if (!on_device) {
// Copy the result back to the caller's buffer, then release both temporary buffers.
CHK_IF_RETURN(
cudaMemcpyAsync(mat_out, d_mat_out, row_size * column_size * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_mat_out, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_allocated_input, ctx.stream));
}
if (!is_async) return CHK_STICKY(cudaStreamSynchronize(ctx.stream));
return CHK_LAST();
}
/**
* Extern version of [Mul](@ref Mul) function with the template parameters
* `S` and `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build.
@@ -196,21 +146,4 @@ namespace vec_ops {
return Sub<curve_config::scalar_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern "C" version of the `transpose_matrix` function with the template parameter
* `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build.
* Note that `output` comes after the two sizes here, whereas `transpose_matrix` takes it
* directly after the input; the arguments are reordered in the call below.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, TransposeMatrix)(
const curve_config::scalar_t* input,
uint32_t row_size,
uint32_t column_size,
curve_config::scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
return transpose_matrix<curve_config::scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
}
} // namespace vec_ops

View File

@@ -96,29 +96,6 @@ namespace vec_ops {
template <typename E>
cudaError_t Sub(E* vec_a, E* vec_b, int n, VecOpsConfig<E>& config, E* result);
/**
* Transposes an input matrix out-of-place inside GPU.
* For example: for ([a[0],a[1],a[2],a[3]], 2, 2) it returns
* [a[0],a[2],a[1],a[3]].
* @param mat_in array of some object of type E of size row_size * column_size.
* @param mat_out buffer of the same size as `mat_in` to write the transposed matrix into.
* @param row_size size of rows.
* @param column_size size of columns.
* @param ctx Device context.
* @param on_device Whether the input and output are on device.
* @param is_async Whether to run the vector operations asynchronously.
* @tparam E The type of the elements of `mat_in` and `mat_out`.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
* NOTE(review): this declaration is named `transpose_batch`; confirm that it matches the
* name of the definition it is meant to declare.
*/
template <typename E>
cudaError_t transpose_batch(
const E* mat_in,
E* mat_out,
uint32_t row_size,
uint32_t column_size,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
} // namespace vec_ops
#endif

View File

@@ -36,11 +36,11 @@ fi
# Run cargo fmt on Rust files
cd wrappers/rust
if [[ $(find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]];
if [[ $(find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]];
then
echo "🚨 There are Rust files that need formatting."
echo "Please go to wrappers/rust and format the Rust files using the following command:"
echo "find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --"
echo "Please format the Rust files using the following command:"
echo "find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --"
status=1
fi

View File

@@ -1,23 +0,0 @@
# Configures and builds the icicle CMake project for one curve, or for every supported curve.
# Usage: <script> <curve | all> [any second argument enables G2]

# G2 is switched ON whenever more than one argument is passed, whatever its value.
$G2_DEFINED = "OFF"
if ($args.Count -gt 1) {
$G2_DEFINED = "ON"
}
# NOTE(review): $BUILD_DIR is assigned but never read in this script, and its relative
# path ("..\icicle") differs from the one used by Set-Location below ("../../icicle") -- confirm.
$BUILD_DIR = (Get-Location).Path + "\..\icicle\build"
$SUPPORTED_CURVES = @("bn254", "bls12_377", "bls12_381", "bw6_761")
# The first argument selects the curve; "all" expands to every supported curve.
if ($args[0] -eq "all") {
$BUILD_CURVES = $SUPPORTED_CURVES
} else {
$BUILD_CURVES = @($args[0])
}
Set-Location "../../icicle"
# -Force keeps this from failing when the build directory already exists.
New-Item -ItemType Directory -Path "build" -Force
# Every curve is configured and built in the same "build" directory, one after another.
foreach ($CURVE in $BUILD_CURVES) {
cmake -DCURVE:STRING=$CURVE -DG2_DEFINED:STRING=$G2_DEFINED -DCMAKE_BUILD_TYPE:STRING=Release -S . -B build
cmake --build build
}

View File

@@ -1,18 +1,12 @@
#!/bin/bash
G2_DEFINED=OFF
ECNTT_DEFINED=OFF
if [[ $2 == "ON" ]]
if [[ $2 ]]
then
G2_DEFINED=ON
fi
if [[ $3 ]]
then
ECNTT_DEFINED=ON
fi
BUILD_DIR=$(realpath "$PWD/../../icicle/build")
SUPPORTED_CURVES=("bn254" "bls12_377" "bls12_381" "bw6_761")
@@ -28,6 +22,6 @@ mkdir -p build
for CURVE in "${BUILD_CURVES[@]}"
do
cmake -DCURVE=$CURVE -DG2_DEFINED=$G2_DEFINED -DECNTT_DEFINED=$ECNTT_DEFINED -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build -j8
cmake -DCURVE=$CURVE -DG2_DEFINED=$G2_DEFINED -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build
done

View File

@@ -1,7 +1,7 @@
package core
import (
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
type IcicleErrorCode int
@@ -16,13 +16,13 @@ const (
type IcicleError struct {
IcicleErrorCode IcicleErrorCode
CudaErrorCode cr.CudaError
CudaErrorCode cuda_runtime.CudaError
reason string
}
func FromCudaError(error cr.CudaError) (err IcicleError) {
func FromCudaError(error cuda_runtime.CudaError) (err IcicleError) {
switch error {
case cr.CudaSuccess:
case cuda_runtime.CudaSuccess:
err.IcicleErrorCode = IcicleSuccess
default:
err.IcicleErrorCode = InternalCudaError
@@ -38,6 +38,6 @@ func FromCodeAndReason(code IcicleErrorCode, reason string) IcicleError {
return IcicleError{
IcicleErrorCode: code,
reason: reason,
CudaErrorCode: cr.CudaErrorUnknown,
CudaErrorCode: cuda_runtime.CudaErrorUnknown,
}
}

View File

@@ -3,12 +3,12 @@ package core
import (
"fmt"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
type MSMConfig struct {
/// Details related to the device such as its id and stream.
Ctx cr.DeviceContext
Ctx cuda_runtime.DeviceContext
pointsSize int32
@@ -55,8 +55,13 @@ type MSMConfig struct {
IsAsync bool
}
// type MSM interface {
// Msm(scalars, points *cuda_runtime.HostOrDeviceSlice, cfg *MSMConfig, results *cuda_runtime.HostOrDeviceSlice) cuda_runtime.CudaError
// GetDefaultMSMConfig() MSMConfig
// }
func GetDefaultMSMConfig() MSMConfig {
ctx, _ := cr.GetDefaultDeviceContext()
ctx, _ := cuda_runtime.GetDefaultDeviceContext()
return MSMConfig{
ctx, // Ctx
0, // pointsSize
@@ -76,7 +81,7 @@ func GetDefaultMSMConfig() MSMConfig {
}
func MsmCheck(scalars HostOrDeviceSlice, points HostOrDeviceSlice, cfg *MSMConfig, results HostOrDeviceSlice) {
scalarsLength, pointsLength, resultsLength := scalars.Len(), points.Len()/int(cfg.PrecomputeFactor), results.Len()
scalarsLength, pointsLength, resultsLength := scalars.Len(), points.Len(), results.Len()
if scalarsLength%pointsLength != 0 {
errorString := fmt.Sprintf(
"Number of points %d does not divide the number of scalars %d",
@@ -99,15 +104,3 @@ func MsmCheck(scalars HostOrDeviceSlice, points HostOrDeviceSlice, cfg *MSMConfi
cfg.arePointsOnDevice = points.IsOnDevice()
cfg.areResultsOnDevice = results.IsOnDevice()
}
// PrecomputeBasesCheck validates the buffers handed to a precompute-bases call.
//
// It panics unless outputBases holds exactly points.Len()*precomputeFactor
// elements. The panic value is a descriptive string.
func PrecomputeBasesCheck(points HostOrDeviceSlice, precomputeFactor int32, outputBases DeviceSlice) {
	outputBasesLength, pointsLength := outputBases.Len(), points.Len()
	if outputBasesLength == pointsLength*int(precomputeFactor) {
		return
	}

	// The ratio reported below is undefined for an empty input; without this
	// guard the descriptive panic would be replaced by an integer
	// divide-by-zero runtime error.
	if pointsLength == 0 {
		panic(fmt.Sprintf(
			"Cannot precompute bases for an empty points slice (output bases length is %d)",
			outputBasesLength,
		))
	}

	panic(fmt.Sprintf(
		"Precompute factor is probably incorrect: expected %d but got %d",
		outputBasesLength/pointsLength,
		precomputeFactor,
	))
}

View File

@@ -4,13 +4,13 @@ import (
"testing"
"github.com/ingonyama-zk/icicle/wrappers/golang/core/internal"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
"github.com/stretchr/testify/assert"
)
func TestMSMDefaultConfig(t *testing.T) {
ctx, _ := cr.GetDefaultDeviceContext()
ctx, _ := cuda_runtime.GetDefaultDeviceContext()
expected := MSMConfig{
ctx, // Ctx
0, // pointsSize

View File

@@ -3,67 +3,53 @@ package core
import (
"fmt"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
type NTTDir int8
const (
KForward NTTDir = iota
KInverse
KInverse NTTDir = 1
)
type Ordering uint32
const (
KNN Ordering = iota
KNR
KRN
KRR
KNM
KMN
)
type NttAlgorithm uint32
const (
Auto NttAlgorithm = iota
Radix2
MixedRadix
KNR Ordering = 1
KRN Ordering = 2
KRR Ordering = 3
KNM Ordering = 4
KMN Ordering = 5
)
type NTTConfig[T any] struct {
/// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
Ctx cr.DeviceContext
Ctx cuda_runtime.DeviceContext
/// Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()` (corresponding to no coset being used).
CosetGen T
/// The number of NTTs to compute. Default value: 1.
BatchSize int32
/// If true the function will compute the NTTs over the columns of the input matrix and not over the rows.
ColumnsBatch bool
/// Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value: `Ordering::kNN`.
Ordering Ordering
areInputsOnDevice bool
areOutputsOnDevice bool
/// Whether to run the NTT asynchronously. If set to `true`, the NTT function will be non-blocking and you'd need to synchronize
/// it explicitly by running `stream.synchronize()`. If set to false, the NTT function will block the current CPU thread.
IsAsync bool
NttAlgorithm NttAlgorithm /**< Explicitly select the NTT algorithm. Default value: Auto (the implementation
selects radix-2 or mixed-radix algorithm based on heuristics). */
IsAsync bool
}
func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T] {
ctx, _ := cr.GetDefaultDeviceContext()
ctx, _ := cuda_runtime.GetDefaultDeviceContext()
return NTTConfig[T]{
ctx, // Ctx
cosetGen, // CosetGen
1, // BatchSize
false, // ColumnsBatch
KNN, // Ordering
false, // areInputsOnDevice
false, // areOutputsOnDevice
false, // IsAsync
Auto,
}
}

View File

@@ -5,7 +5,7 @@ import (
"testing"
"github.com/ingonyama-zk/icicle/wrappers/golang/core/internal"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
"github.com/stretchr/testify/assert"
)
@@ -14,17 +14,15 @@ func TestNTTDefaultConfig(t *testing.T) {
cosetGenField.One()
var cosetGen [1]uint32
copy(cosetGen[:], cosetGenField.GetLimbs())
ctx, _ := cr.GetDefaultDeviceContext()
ctx, _ := cuda_runtime.GetDefaultDeviceContext()
expected := NTTConfig[[1]uint32]{
ctx, // Ctx
cosetGen, // CosetGen
1, // BatchSize
false, // ColumnsBatch
KNN, // Ordering
false, // areInputsOnDevice
false, // areOutputsOnDevice
false, // IsAsync
Auto, // NttAlgorithm
}
actual := GetDefaultNTTConfig(cosetGen)

View File

@@ -3,7 +3,7 @@ package core
import (
"unsafe"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
type HostOrDeviceSlice interface {
@@ -43,123 +43,47 @@ func (d DeviceSlice) IsOnDevice() bool {
return true
}
// GetDeviceId returns the device id that cr.GetDeviceFromPointer reports for
// the slice's underlying pointer.
func (d DeviceSlice) GetDeviceId() int {
	deviceId := cr.GetDeviceFromPointer(d.inner)
	return deviceId
}
// CheckDevice panics when the current device cannot be queried or when it is
// not the device this DeviceSlice belongs to. Call it before using the slice
// to make sure it resides on the currently set device.
func (d DeviceSlice) CheckDevice() {
	currentDeviceId, err := cr.GetDevice()
	// The device of the slice is only looked up when the query above succeeded.
	if err != cr.CudaSuccess || d.GetDeviceId() != currentDeviceId {
		panic("Attempt to use DeviceSlice on a different device")
	}
}
// Range returns the part of d that starts at element start and stops at
// element end; endInclusive decides whether element end itself is included.
// The result is built with RangeFrom and RangeTo.
//
// It panics when end <= start, when end >= d.length (regardless of
// endInclusive) or when start is negative.
func (d *DeviceSlice) Range(start, end int, endInclusive bool) DeviceSlice {
	if end <= start {
		panic("Cannot have negative or zero size slices")
	}
	if end >= d.length {
		panic("Cannot increase slice size from Range")
	}
	if start < 0 {
		panic("Negative value for start is not supported")
	}

	if start == 0 {
		return d.RangeTo(end, endInclusive)
	}

	// Drop the first start elements, then cut the remainder relative to its new origin.
	tail := d.RangeFrom(start)
	return tail.RangeTo(end-start, endInclusive)
}
// RangeTo returns a DeviceSlice that shares d's pointer and covers its first
// end elements, or end+1 elements when inclusive is true.
//
// It panics when end <= 0 or end >= d.length.
func (d *DeviceSlice) RangeTo(end int, inclusive bool) DeviceSlice {
	if end <= 0 {
		panic("Cannot have negative or zero size slices")
	}
	if end >= d.length {
		panic("Cannot increase slice size from Range")
	}

	elementCount := end
	if inclusive {
		elementCount++
	}

	// Size of one element, derived from the slice's capacity and length.
	elementSize := d.capacity / d.length

	return DeviceSlice{
		inner:    d.inner,
		capacity: elementCount * elementSize,
		length:   elementCount,
	}
}
// RangeFrom returns a DeviceSlice that skips the first start elements of d.
// The returned slice points into d's memory at the corresponding offset; its
// length and capacity are reduced accordingly.
//
// It panics when start >= d.length or start is negative.
func (d *DeviceSlice) RangeFrom(start int) DeviceSlice {
	if start >= d.length {
		panic("Cannot have negative or zero size slices")
	}
	if start < 0 {
		panic("Negative value for start is not supported")
	}

	// Size of one element, derived from the slice's capacity and length.
	elementSize := d.capacity / d.length
	skipped := uintptr(start) * uintptr(elementSize)

	return DeviceSlice{
		inner:    unsafe.Pointer(uintptr(d.inner) + skipped),
		capacity: d.capacity - start*elementSize,
		length:   d.length - start,
	}
}
// TODO: change signature to be Malloc(element, numElements)
// calc size internally
func (d *DeviceSlice) Malloc(size, sizeOfElement int) (DeviceSlice, cr.CudaError) {
dp, err := cr.Malloc(uint(size))
func (d *DeviceSlice) Malloc(size, sizeOfElement int) (DeviceSlice, cuda_runtime.CudaError) {
dp, err := cuda_runtime.Malloc(uint(size))
d.inner = dp
d.capacity = size
d.length = size / sizeOfElement
return *d, err
}
func (d *DeviceSlice) MallocAsync(size, sizeOfElement int, stream cr.CudaStream) (DeviceSlice, cr.CudaError) {
dp, err := cr.MallocAsync(uint(size), stream)
func (d *DeviceSlice) MallocAsync(size, sizeOfElement int, stream cuda_runtime.CudaStream) (DeviceSlice, cuda_runtime.CudaError) {
dp, err := cuda_runtime.MallocAsync(uint(size), stream)
d.inner = dp
d.capacity = size
d.length = size / sizeOfElement
return *d, err
}
func (d *DeviceSlice) Free() cr.CudaError {
d.CheckDevice()
err := cr.Free(d.inner)
if err == cr.CudaSuccess {
func (d *DeviceSlice) Free() cuda_runtime.CudaError {
err := cuda_runtime.Free(d.inner)
if err == cuda_runtime.CudaSuccess {
d.length, d.capacity = 0, 0
d.inner = nil
}
return err
}
func (d *DeviceSlice) FreeAsync(stream cr.Stream) cr.CudaError {
d.CheckDevice()
err := cr.FreeAsync(d.inner, stream)
if err == cr.CudaSuccess {
d.length, d.capacity = 0, 0
d.inner = nil
}
return err
type HostSliceInterface interface {
Size() int
}
type HostSlice[T any] []T
type HostSlice[T HostSliceInterface] []T
func HostSliceFromElements[T any](elements []T) HostSlice[T] {
return elements
func HostSliceFromElements[T HostSliceInterface](elements []T) HostSlice[T] {
slice := make(HostSlice[T], len(elements))
copy(slice, elements)
return slice
}
func HostSliceWithValue[T any](underlyingValue T, size int) HostSlice[T] {
func HostSliceWithValue[T HostSliceInterface](underlyingValue T, size int) HostSlice[T] {
slice := make(HostSlice[T], size)
for i := range slice {
slice[i] = underlyingValue
@@ -185,7 +109,7 @@ func (h HostSlice[T]) IsOnDevice() bool {
}
func (h HostSlice[T]) SizeOfElement() int {
return int(unsafe.Sizeof(h[0]))
return h[0].Size()
}
func (h HostSlice[T]) CopyToDevice(dst *DeviceSlice, shouldAllocate bool) *DeviceSlice {
@@ -193,47 +117,44 @@ func (h HostSlice[T]) CopyToDevice(dst *DeviceSlice, shouldAllocate bool) *Devic
if shouldAllocate {
dst.Malloc(size, h.SizeOfElement())
}
dst.CheckDevice()
if size > dst.Cap() {
panic("Number of bytes to copy is too large for destination")
}
// hostSrc := unsafe.Pointer(h.AsPointer())
hostSrc := unsafe.Pointer(&h[0])
cr.CopyToDevice(dst.inner, hostSrc, uint(size))
cuda_runtime.CopyToDevice(dst.inner, hostSrc, uint(size))
dst.length = h.Len()
return dst
}
func (h HostSlice[T]) CopyToDeviceAsync(dst *DeviceSlice, stream cr.CudaStream, shouldAllocate bool) *DeviceSlice {
func (h HostSlice[T]) CopyToDeviceAsync(dst *DeviceSlice, stream cuda_runtime.CudaStream, shouldAllocate bool) *DeviceSlice {
size := h.Len() * h.SizeOfElement()
if shouldAllocate {
dst.MallocAsync(size, h.SizeOfElement(), stream)
}
dst.CheckDevice()
if size > dst.Cap() {
panic("Number of bytes to copy is too large for destination")
}
hostSrc := unsafe.Pointer(&h[0])
cr.CopyToDeviceAsync(dst.inner, hostSrc, uint(size), stream)
cuda_runtime.CopyToDeviceAsync(dst.inner, hostSrc, uint(size), stream)
dst.length = h.Len()
return dst
}
func (h HostSlice[T]) CopyFromDevice(src *DeviceSlice) {
src.CheckDevice()
if h.Len() != src.Len() {
panic("destination and source slices have different lengths")
}
bytesSize := src.Len() * h.SizeOfElement()
cr.CopyFromDevice(unsafe.Pointer(&h[0]), src.inner, uint(bytesSize))
cuda_runtime.CopyFromDevice(unsafe.Pointer(&h[0]), src.inner, uint(bytesSize))
}
func (h HostSlice[T]) CopyFromDeviceAsync(src *DeviceSlice, stream cr.Stream) {
src.CheckDevice()
func (h HostSlice[T]) CopyFromDeviceAsync(src *DeviceSlice, stream cuda_runtime.Stream) {
if h.Len() != src.Len() {
panic("destination and source slices have different lengths")
}
bytesSize := src.Len() * h.SizeOfElement()
cr.CopyFromDeviceAsync(unsafe.Pointer(&h[0]), src.inner, uint(bytesSize), stream)
cuda_runtime.CopyFromDeviceAsync(unsafe.Pointer(&h[0]), src.inner, uint(bytesSize), stream)
}

View File

@@ -81,10 +81,6 @@ func TestHostSlice(t *testing.T) {
hostSlice := HostSliceFromElements(randFields)
assert.Equal(t, hostSlice.Len(), 4)
assert.Equal(t, hostSlice.Cap(), 4)
hostSliceCasted := (HostSlice[internal.MockField])(randFields)
assert.Equal(t, hostSliceCasted.Len(), 4)
assert.Equal(t, hostSliceCasted.Cap(), 4)
}
func TestHostSliceIsEmpty(t *testing.T) {
@@ -194,31 +190,3 @@ func TestCopyToFromHostDeviceProjectivePoints(t *testing.T) {
assert.Equal(t, hostSlice, hostSlice2)
}
func TestSliceRanges(t *testing.T) {
var deviceSlice DeviceSlice
numPoints := 1 << 3
randProjectives := randomProjectivePoints(numPoints, fieldSize)
hostSlice := (HostSlice[internal.MockProjective])(randProjectives)
hostSlice.CopyToDevice(&deviceSlice, true)
// RangeFrom
var zeroProj internal.MockProjective
hostSliceRet := HostSliceWithValue[internal.MockProjective](zeroProj, numPoints-2)
deviceSliceRangeFrom := deviceSlice.RangeFrom(2)
hostSliceRet.CopyFromDevice(&deviceSliceRangeFrom)
assert.Equal(t, hostSlice[2:], hostSliceRet)
// RangeTo
deviceSliceRangeTo := deviceSlice.RangeTo(numPoints-3, true)
hostSliceRet.CopyFromDevice(&deviceSliceRangeTo)
assert.Equal(t, hostSlice[:6], hostSliceRet)
// Range
hostSliceRange := HostSliceWithValue[internal.MockProjective](zeroProj, numPoints-4)
deviceSliceRange := deviceSlice.Range(2, numPoints-3, true)
hostSliceRange.CopyFromDevice(&deviceSliceRange)
assert.Equal(t, hostSlice[2:6], hostSliceRange)
}

View File

@@ -1,74 +0,0 @@
package core
import (
"fmt"
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
)
// VecOps enumerates the vector operations Sub, Add and Mul.
type VecOps int

// The numeric value of each operation is spelled out explicitly.
const (
	Sub VecOps = 0
	Add VecOps = 1
	Mul VecOps = 2
)
// VecOpsConfig holds the options for a vector operation.
// The unexported is*OnDevice flags are overwritten by VecOpCheck from the
// slices passed to the operation.
type VecOpsConfig struct {
// Ctx holds details related to the device such as its id and stream.
Ctx cr.DeviceContext
// isAOnDevice is true if `a` is on device and false if it is not. Default value: false.
isAOnDevice bool
// isBOnDevice is true if `b` is on device and false if it is not. Default value: false.
isBOnDevice bool
// isResultOnDevice: if true, output is preserved on device, otherwise on host. Default value: false.
isResultOnDevice bool
// IsResultMontgomeryForm is true if the `result` vector should be in Montgomery form
// and false otherwise. Default value: false.
IsResultMontgomeryForm bool
// IsAsync selects whether to run the vector operations asynchronously. If set to `true`,
// the function will be non-blocking and you'll need to synchronize it explicitly by calling
// `SynchronizeStream`. If set to false, the function will block the current CPU thread.
IsAsync bool
}
// DefaultVecOpsConfig returns the default value of VecOpsConfig: the default
// device context with every boolean option set to false.
//
// The error returned by cr.GetDefaultDeviceContext is discarded.
func DefaultVecOpsConfig() VecOpsConfig {
	ctx, _ := cr.GetDefaultDeviceContext()
	return VecOpsConfig{
		Ctx:                    ctx,
		isAOnDevice:            false,
		isBOnDevice:            false,
		isResultOnDevice:       false,
		IsResultMontgomeryForm: false,
		IsAsync:                false,
	}
}
// VecOpCheck validates the operands of a vector operation and records where
// they live.
//
// It panics with a descriptive string when a and b differ in length, or when
// a and out differ in length. On success it sets cfg.isAOnDevice,
// cfg.isBOnDevice and cfg.isResultOnDevice from the IsOnDevice result of
// a, b and out respectively.
func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) {
	aLen, bLen, outLen := a.Len(), b.Len(), out.Len()

	switch {
	case aLen != bLen:
		panic(fmt.Sprintf("a and b vector lengths %d; %d are not equal", aLen, bLen))
	case aLen != outLen:
		panic(fmt.Sprintf("a and out vector lengths %d; %d are not equal", aLen, outLen))
	}

	cfg.isAOnDevice = a.IsOnDevice()
	cfg.isBOnDevice = b.IsOnDevice()
	cfg.isResultOnDevice = out.IsOnDevice()
}

View File

@@ -1,23 +0,0 @@
package core
import (
cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
"github.com/stretchr/testify/assert"
"testing"
)
// TestVecOpsDefaultConfig checks that DefaultVecOpsConfig returns the default
// device context together with all boolean options set to false.
func TestVecOpsDefaultConfig(t *testing.T) {
	defaultCtx, _ := cr.GetDefaultDeviceContext()
	want := VecOpsConfig{
		Ctx:                    defaultCtx,
		isAOnDevice:            false,
		isBOnDevice:            false,
		isResultOnDevice:       false,
		IsResultMontgomeryForm: false,
		IsAsync:                false,
	}

	assert.Equal(t, want, DefaultVecOpsConfig())
}

View File

@@ -9,8 +9,6 @@ package cuda_runtime
*/
import "C"
import (
"fmt"
"runtime"
"unsafe"
)
@@ -19,28 +17,20 @@ type DeviceContext struct {
Stream *Stream // Assuming the type is provided by a CUDA binding crate
/// Index of the currently used GPU. Default value: 0.
deviceId uint
DeviceId uint
/// Mempool to use. Default value: 0.
Mempool MemPool // Assuming the type is provided by a CUDA binding crate
}
func (d DeviceContext) GetDeviceId() int {
return int(d.deviceId)
// TODO: use cuda_bindings.CudaMemPool as type
Mempool uint // Assuming the type is provided by a CUDA binding crate
}
func GetDefaultDeviceContext() (DeviceContext, CudaError) {
device, err := GetDevice()
if err != CudaSuccess {
panic(fmt.Sprintf("Could not get current device due to %v", err))
}
var defaultStream Stream
var defaultMempool MemPool
return DeviceContext{
&defaultStream,
uint(device),
defaultMempool,
0,
0,
}, CudaSuccess
}
@@ -57,78 +47,3 @@ func GetDeviceCount() (int, CudaError) {
err := C.cudaGetDeviceCount(cCount)
return count, (CudaError)(err)
}
// GetDevice wraps cudaGetDevice and returns the device index it reports
// together with the call's status converted to CudaError.
func GetDevice() (int, CudaError) {
	// Hand C a variable of its own int type instead of a Go int reinterpreted
	// through unsafe.Pointer: the two types differ in size on 64-bit
	// platforms, so the reinterpretation only yields the right value on
	// little-endian hosts.
	var cDevice C.int
	err := C.cudaGetDevice(&cDevice)
	return int(cDevice), (CudaError)(err)
}
// GetDeviceFromPointer queries cudaPointerGetAttributes for ptr and returns
// the device field of the resulting attributes.
//
// It panics when the query does not return CudaSuccess.
func GetDeviceFromPointer(ptr unsafe.Pointer) int {
	var attrs CudaPointerAttributes
	if status := C.cudaPointerGetAttributes(&attrs, ptr); (CudaError)(status) != CudaSuccess {
		panic("Could not get attributes of pointer")
	}
	return int(attrs.device)
}
// RunOnDevice forces the provided function to run all GPU related calls within it
// on the same host thread and therefore the same GPU device.
//
// NOTE: Goroutines launched within funcToRun are not bound to the
// same host thread as funcToRun and therefore not to the same GPU device.
// If that is a requirement, RunOnDevice should be called for each with the
// same deviceId as the original call.
//
// As an example:
//
// cr.RunOnDevice(i, func(args ...any) {
// defer wg.Done()
// cfg := GetDefaultMSMConfig()
// stream, _ := cr.CreateStream()
// for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
// size := 1 << power
//
// // This will always print "Inner goroutine device: 0"
// // go func () {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // }()
// // To force the above goroutine to same device as the wrapping function:
// // RunOnDevice(i, func(arg ...any) {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // })
//
// scalars := GenerateScalars(size)
// points := GenerateAffinePoints(size)
//
// var p Projective
// var out core.DeviceSlice
// _, e := out.MallocAsync(p.Size(), p.Size(), stream)
// assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
// cfg.Ctx.Stream = &stream
// cfg.IsAsync = true
//
// e = Msm(scalars, points, &cfg, out)
// assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
//
// outHost := make(core.HostSlice[Projective], 1)
//
// cr.SynchronizeStream(&stream)
// outHost.CopyFromDevice(&out)
// out.Free()
// // Check with gnark-crypto
// assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
// }
// }, i)
func RunOnDevice(deviceId int, funcToRun func(args ...any), args ...any) {
	go func() {
		// Keep this goroutine on one OS thread for the whole call and release
		// the thread again once funcToRun has returned.
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()

		// The result of SetDevice is not inspected.
		SetDevice(deviceId)
		funcToRun(args...)
	}()
}

View File

@@ -12,8 +12,6 @@ import (
"unsafe"
)
type MemPool = CudaMemPool
func Malloc(size uint) (unsafe.Pointer, CudaError) {
if size == 0 {
return nil, CudaErrorMemoryAllocation

View File

@@ -17,6 +17,3 @@ type CudaEvent C.cudaEvent_t
// CudaMemPool as declared in include/driver_types.h:2928
type CudaMemPool C.cudaMemPool_t
// CudaMemPool as declared in include/driver_types.h:2928
type CudaPointerAttributes = C.struct_cudaPointerAttributes

View File

@@ -146,12 +146,10 @@ func convertAffinePointsMontgomery(points *core.DeviceSlice, isInto bool) cr.Cud
}
// AffineToMontgomery calls points.CheckDevice and then forwards points to
// convertAffinePointsMontgomery with isInto set to true, returning its CudaError.
func AffineToMontgomery(points *core.DeviceSlice) cr.CudaError {
points.CheckDevice()
return convertAffinePointsMontgomery(points, true)
}
// AffineFromMontgomery calls points.CheckDevice and then forwards points to
// convertAffinePointsMontgomery with isInto set to false, returning its CudaError.
func AffineFromMontgomery(points *core.DeviceSlice) cr.CudaError {
points.CheckDevice()
return convertAffinePointsMontgomery(points, false)
}
@@ -167,11 +165,9 @@ func convertProjectivePointsMontgomery(points *core.DeviceSlice, isInto bool) cr
}
// ProjectiveToMontgomery calls points.CheckDevice and then forwards points to
// convertProjectivePointsMontgomery with isInto set to true, returning its CudaError.
func ProjectiveToMontgomery(points *core.DeviceSlice) cr.CudaError {
points.CheckDevice()
return convertProjectivePointsMontgomery(points, true)
}
// ProjectiveFromMontgomery calls points.CheckDevice and then forwards points to
// convertProjectivePointsMontgomery with isInto set to false, returning its CudaError.
func ProjectiveFromMontgomery(points *core.DeviceSlice) cr.CudaError {
points.CheckDevice()
return convertProjectivePointsMontgomery(points, false)
}

Some files were not shown because too many files have changed in this diff Show More