Compare commits

..

2 Commits

Author SHA1 Message Date
release-bot
aacec3f72f Bump rust crates' version
icicle-babybear@2.8.0
icicle-bls12-377@2.8.0
icicle-bls12-381@2.8.0
icicle-bn254@2.8.0
icicle-bw6-761@2.8.0
icicle-core@2.8.0
icicle-cuda-runtime@2.8.0
icicle-grumpkin@2.8.0
icicle-hash@2.8.0
icicle-m31@2.8.0
icicle-stark252@2.8.0

Generated by cargo-workspaces
2024-07-16 13:57:56 +00:00
ChickenLover
a8fa05d0e3 Feat/roman/hash docs (#556)
## Describe the changes

This PR...

## Linked Issues

Resolves #

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-07-16 16:39:35 +03:00
17 changed files with 285 additions and 301 deletions

View File

@@ -14,9 +14,62 @@ At its core, Keccak consists of a permutation function operating on a state arra
## Using Keccak
ICICLE Keccak supports batch hashing, which can be utilized for constructing a merkle tree.
ICICLE Keccak supports batch hashing, which can be utilized for constructing a merkle tree or running multiple hashes in parallel.
### Supported Bindings
- [Golang](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/hash/keccak)
- [Rust](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-hash)
- [Rust](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-hash)
### Example usage
This is an example of running 1024 Keccak-256 hashes in parallel, where input strings are of size 136 bytes:
```rust
use icicle_core::hash::HashConfig;
use icicle_cuda_runtime::memory::HostSlice;
use icicle_hash::keccak::keccak256;
let config = HashConfig::default();
let input_block_len = 136;
let number_of_hashes = 1024;
let preimages = vec![1u8; number_of_hashes * input_block_len];
let mut digests = vec![0u8; number_of_hashes * 64];
let preimages_slice = HostSlice::from_slice(&preimages);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
keccak256(
preimages_slice,
input_block_len as u32,
number_of_hashes as u32,
digests_slice,
&config,
)
.unwrap();
```
### Merkle Tree
You can build a keccak merkle tree using the corresponding functions:
```rust
use icicle_core::tree::{merkle_tree_digests_len, TreeBuilderConfig};
use icicle_cuda_runtime::memory::HostSlice;
use icicle_hash::keccak::build_keccak256_merkle_tree;
let mut config = TreeBuilderConfig::default();
config.arity = 2;
let height = 22;
let input_block_len = 136;
let leaves = vec![1u8; (1 << height) * input_block_len];
let mut digests = vec![0u64; merkle_tree_digests_len((height + 1) as u32, 2, 1)];
let leaves_slice = HostSlice::from_slice(&leaves);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
build_keccak256_merkle_tree(leaves_slice, digests_slice, height, input_block_len, &config).unwrap();
```
In the example above, a binary tree of height 22 is being built. Each leaf is considered to be a 136 byte long array. The leaves and digests are aligned in a flat array. You can also use keccak512 in `build_keccak512_merkle_tree` function.

View File

@@ -53,6 +53,7 @@ So for Poseidon of arity 2 and input of size 1024 * 2, we would expect 1024 elem
### Supported Bindings
[`Go`](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/golang/curves/bn254/poseidon/poseidon.go)
[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon)
### Constants
@@ -91,8 +92,6 @@ primitive_element = 7 # bls12-381
# primitive_element = 15 # bw6-761
```
We only support `alpha = 5` so if you want to use another alpha for S-box please reach out on discord or open a github issue.
### Rust API
This is the most basic way to use the Poseidon API.
@@ -101,71 +100,58 @@ This is the most basic way to use the Poseidon API.
let test_size = 1 << 10;
let arity = 2u32;
let ctx = get_default_device_context();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
let poseidon = Poseidon::load(arity, &ctx).unwrap();
let config = HashConfig::default();
let inputs = vec![F::one(); test_size * arity as usize];
let outputs = vec![F::zero(); test_size];
let mut input_slice = HostOrDeviceSlice::on_host(inputs);
let mut output_slice = HostOrDeviceSlice::on_host(outputs);
poseidon_hash_many::<F>(
poseidon.hash_many::<F>(
&mut input_slice,
&mut output_slice,
test_size as u32,
arity as u32,
&constants,
1, // Output length
&config,
)
.unwrap();
```
The `PoseidonConfig::default()` can be modified, by default the inputs and outputs are set to be on `Host` for example.
The `HashConfig` can be modified, by default the inputs and outputs are set to be on `Host` for example.
```rust
impl<'a> Default for PoseidonConfig<'a> {
impl<'a> Default for HashConfig<'a> {
fn default() -> Self {
let ctx = get_default_device_context();
Self {
ctx,
are_inputs_on_device: false,
are_outputs_on_device: false,
input_is_a_state: false,
aligned: false,
loop_state: false,
is_async: false,
}
}
}
```
In the example above `load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();` is used which will load the correct constants based on arity and curve. Its possible to [generate](#constants) your own constants and load them.
In the example above `Poseidon::load(arity, &ctx).unwrap();` is used which will load the correct constants based on arity and curve. Its possible to [generate](#constants) your own constants and load them.
```rust
let ctx = get_default_device_context();
let cargo_manifest_dir = env!("CARGO_MANIFEST_DIR");
let constants_file = PathBuf::from(cargo_manifest_dir)
.join("tests")
.join(format!("{}_constants.bin", field_prefix));
let mut constants_buf = vec![];
File::open(constants_file)
.unwrap()
.read_to_end(&mut constants_buf)
.unwrap();
let mut custom_constants = vec![];
for chunk in constants_buf.chunks(field_bytes) {
custom_constants.push(F::from_bytes_le(chunk));
}
let custom_constants = create_optimized_poseidon_constants::<F>(
arity as u32,
&ctx,
full_rounds_half,
partial_rounds,
&mut custom_constants,
)
.unwrap();
let custom_poseidon = Poseidon::new(
arity, // The arity of poseidon hash. The width will be equal to arity + 1
alpha, // The S-box power
full_rounds_half,
partial_rounds,
round_constants,
mds_matrix,
non_sparse_matrix,
sparse_matrices,
domain_tag,
ctx,
)
.unwrap();
```
## The Tree Builder
@@ -175,21 +161,34 @@ The tree builder allows you to build Merkle trees using Poseidon.
You can define both the tree's `height` and its `arity`. The tree `height` determines the number of layers in the tree, including the root and the leaf layer. The `arity` determines how many children each internal node can have.
```rust
let height = 20;
let arity = 2;
let leaves = vec![F::one(); 1 << (height - 1)];
let mut digests = vec![F::zero(); merkle_tree_digests_len(height, arity)];
let mut leaves_slice = HostOrDeviceSlice::on_host(leaves);
let ctx = get_default_device_context();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap()
use icicle_bn254::tree::Bn254TreeBuilder;
use icicle_bn254::poseidon::Poseidon;
let mut config = TreeBuilderConfig::default();
config.keep_rows = 1;
build_poseidon_merkle_tree::<F>(&mut leaves_slice, &mut digests, height, arity, &constants, &config).unwrap();
let arity = 2;
config.arity = arity as u32;
let input_block_len = arity;
let leaves = vec![F::one(); (1 << height) * arity];
let mut digests = vec![F::zero(); merkle_tree_digests_len((height + 1) as u32, arity as u32, 1)];
println!("Root: {:?}", digests[0..1][0]);
let leaves_slice = HostSlice::from_slice(&leaves);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
let ctx = device_context::DeviceContext::default();
let hash = Poseidon::load(2, &ctx).unwrap();
let mut config = TreeBuilderConfig::default();
config.keep_rows = 5;
Bn254TreeBuilder::build_merkle_tree(
leaves_slice,
digests_slice,
height,
input_block_len,
&hash,
&hash,
&config,
)
.unwrap();
```
Similar to Poseidon, you can also configure the Tree Builder `TreeBuilderConfig::default()`

View File

@@ -0,0 +1,88 @@
# Poseidon2
[Poseidon2](https://eprint.iacr.org/2023/323) is a recently released optimized version of Poseidon1. The two versions differ in two crucial points. First, Poseidon is a sponge hash function, while Poseidon2 can be either a sponge or a compression function depending on the use case. Secondly, Poseidon2 is instantiated by new and more efficient linear layers with respect to Poseidon. These changes decrease the number of multiplications in the linear layer by up to 90% and the number of constraints in Plonk circuits by up to 70%. This makes Poseidon2 currently the fastest arithmetization-oriented hash function without lookups.
## Using Poseidon2
ICICLE Poseidon2 is implemented for GPU and parallelization is performed for each state.
We calculate multiple hash-sums over multiple pre-images in parallel, rather than going block by block over the input vector.
For example, for Poseidon2 of width 16, input rate 8, output elements 8 and input of size 1024 * 8, we would expect 1024 * 8 elements of output. Which means each input block would be of size 8, resulting in 1024 Poseidon2 hashes being performed.
### Supported Bindings
[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon2)
### Constants
Poseidon2 is also extremely customizable and using different constants will produce different hashes, security levels and performance results.
We support pre-calculated constants for each of the [supported curves](../core#supported-curves-and-operations). The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon2/constants) and are labeled clearly per curve `<curve_name>_poseidon2.h`.
You can also use your own set of constants as shown [here](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-fields/icicle-babybear/src/poseidon2/mod.rs#L290)
### Rust API
This is the most basic way to use the Poseidon2 API.
```rust
let test_size = 1 << 10;
let width = 16;
let rate = 8;
let ctx = get_default_device_context();
let poseidon = Poseidon2::load(width, rate, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();
let config = HashConfig::default();
let inputs = vec![F::one(); test_size * rate as usize];
let outputs = vec![F::zero(); test_size];
let mut input_slice = HostOrDeviceSlice::on_host(inputs);
let mut output_slice = HostOrDeviceSlice::on_host(outputs);
poseidon.hash_many::<F>(
&mut input_slice,
&mut output_slice,
test_size as u32,
rate as u32,
8, // Output length
&config,
)
.unwrap();
```
In the example above `Poseidon2::load(width, rate, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();` is used to load the correct constants based on width and curve. Here, the default MDS matrices and diffusion are used. If you want to get a Plonky3 compliant version, set them to `MdsType::Plonky` and `DiffusionStrategy::Montgomery` respectively.
## The Tree Builder
Similar to Poseidon1, you can use Poseidon2 in a tree builder.
```rust
use icicle_bn254::tree::Bn254TreeBuilder;
use icicle_bn254::poseidon2::Poseidon2;
let mut config = TreeBuilderConfig::default();
let arity = 2;
config.arity = arity as u32;
let input_block_len = arity;
let leaves = vec![F::one(); (1 << height) * arity];
let mut digests = vec![F::zero(); merkle_tree_digests_len((height + 1) as u32, arity as u32, 1)];
let leaves_slice = HostSlice::from_slice(&leaves);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
let ctx = device_context::DeviceContext::default();
let hash = Poseidon2::load(arity, arity, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();
let mut config = TreeBuilderConfig::default();
config.keep_rows = 5;
Bn254TreeBuilder::build_merkle_tree(
leaves_slice,
digests_slice,
height,
input_block_len,
&hash,
&hash,
&config,
)
.unwrap();
```

19
docs/package-lock.json generated
View File

@@ -3680,6 +3680,8 @@
"version": "8.12.0",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
"optional": true,
"peer": true,
"dependencies": {
"fast-deep-equal": "^3.1.1",
"json-schema-traverse": "^1.0.0",
@@ -3694,7 +3696,9 @@
"node_modules/ajv-formats/node_modules/json-schema-traverse": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
"optional": true,
"peer": true
},
"node_modules/ajv-keywords": {
"version": "3.5.2",
@@ -16340,14 +16344,13 @@
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-2.1.1.tgz",
"integrity": "sha512-Wx0Kx52hxE7C18hkMEggYlEifqWZtYaRgouJor+WMdPnQyEK13vgEWyVNup7SoeeoLMsr4kf5h6dOW11I15MUA==",
"requires": {
"ajv": "^8.0.0"
},
"requires": {},
"dependencies": {
"ajv": {
"version": "8.12.0",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"version": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
"optional": true,
"peer": true,
"requires": {
"fast-deep-equal": "^3.1.1",
"json-schema-traverse": "^1.0.0",
@@ -16358,7 +16361,9 @@
"json-schema-traverse": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
"optional": true,
"peer": true
}
}
},

View File

@@ -53,6 +53,11 @@ module.exports = {
label: "Poseidon Hash",
id: "icicle/primitives/poseidon",
},
{
type: "doc",
label: "Poseidon2 Hash",
id: "icicle/primitives/poseidon2",
},
],
},
{

View File

@@ -19,5 +19,5 @@ add_executable(
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bls12_377.a)
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -10,9 +10,6 @@ mkdir -p build/icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=ON
cmake --build build/icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bls12_377 -DG2=ON
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -3,23 +3,9 @@
#include <iomanip>
#include "api/bn254.h"
#include "api/bls12_377.h"
using namespace bn254;
// using namespace bn254;
typedef bn254::scalar_t scalar_bn254;
typedef bn254::affine_t affine_bn254;
typedef bn254::g2_affine_t g2_affine_bn254;
typedef bn254::projective_t projective_bn254;
typedef bn254::g2_projective_t g2_projective_bn254;
typedef bls12_377::scalar_t scalar_bls12377;
typedef bls12_377::affine_t affine_bls12377;
typedef bls12_377::g2_affine_t g2_affine_bls12377;
typedef bls12_377::projective_t projective_bls12377;
typedef bls12_377::g2_projective_t g2_projective_bls12377;
int msm_bn254(int argc, char* argv[])
int main(int argc, char* argv[])
{
std::cout << "Icicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
std::cout << "Example parameters" << std::endl;
@@ -32,11 +18,11 @@ int msm_bn254(int argc, char* argv[])
std::cout << "Part I: use G1 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
scalar_bn254* scalars = new scalar_bn254[N];
affine_bn254* points = new affine_bn254[N];
projective_bn254 result;
scalar_bn254::rand_host_many(scalars, N);
projective_bn254::rand_host_many_affine(points, N);
scalar_t* scalars = new scalar_t[N];
affine_t* points = new affine_t[N];
projective_t result;
scalar_t::rand_host_many(scalars, N);
projective_t::rand_host_many_affine(points, N);
std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
device_context::DeviceContext ctx = device_context::get_default_device_context();
@@ -62,17 +48,17 @@ int msm_bn254(int argc, char* argv[])
cudaStream_t stream = config.ctx.stream;
// Execute the MSM kernel
bn254_msm_cuda(scalars, points, msm_size, config, &result);
std::cout << projective_bn254::to_affine(result) << std::endl;
std::cout << projective_t::to_affine(result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
scalar_bn254* scalars_d;
affine_bn254* points_d;
projective_bn254* result_d;
cudaMalloc(&scalars_d, sizeof(scalar_bn254) * N);
cudaMalloc(&points_d, sizeof(affine_bn254) * N);
cudaMalloc(&result_d, sizeof(projective_bn254));
cudaMemcpy(scalars_d, scalars, sizeof(scalar_bn254) * N, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(affine_bn254) * N, cudaMemcpyHostToDevice);
scalar_t* scalars_d;
affine_t* points_d;
projective_t* result_d;
cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
cudaMalloc(&points_d, sizeof(affine_t) * N);
cudaMalloc(&result_d, sizeof(projective_t));
cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(affine_t) * N, cudaMemcpyHostToDevice);
std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
config.are_results_on_device = true;
@@ -84,9 +70,9 @@ int msm_bn254(int argc, char* argv[])
bn254_msm_cuda(scalars_d, points_d, msm_size, config, result_d);
// Copy the result back to the host
cudaMemcpy(&result, result_d, sizeof(projective_bn254), cudaMemcpyDeviceToHost);
cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
// Print the result
std::cout << projective_bn254::to_affine(result) << std::endl;
std::cout << projective_t::to_affine(result) << std::endl;
// Free the device memory
cudaFree(scalars_d);
cudaFree(points_d);
@@ -98,25 +84,25 @@ int msm_bn254(int argc, char* argv[])
std::cout << "Generating random inputs on-host" << std::endl;
// use the same scalars
g2_affine_bn254* g2_points = new g2_affine_bn254[N];
g2_projective_bn254::rand_host_many_affine(g2_points, N);
g2_affine_t* g2_points = new g2_affine_t[N];
g2_projective_t::rand_host_many_affine(g2_points, N);
std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
config.are_results_on_device = false;
config.are_scalars_on_device = false;
config.are_points_on_device = false;
g2_projective_bn254 g2_result;
g2_projective_t g2_result;
bn254_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
std::cout << g2_projective_bn254::to_affine(g2_result) << std::endl;
std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
g2_affine_bn254* g2_points_d;
g2_projective_bn254* g2_result_d;
cudaMalloc(&scalars_d, sizeof(scalar_bn254) * N);
cudaMalloc(&g2_points_d, sizeof(g2_affine_bn254) * N);
cudaMalloc(&g2_result_d, sizeof(g2_projective_bn254));
cudaMemcpy(scalars_d, scalars, sizeof(scalar_bn254) * N, cudaMemcpyHostToDevice);
cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_bn254) * N, cudaMemcpyHostToDevice);
g2_affine_t* g2_points_d;
g2_projective_t* g2_result_d;
cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
cudaMalloc(&g2_points_d, sizeof(g2_affine_t) * N);
cudaMalloc(&g2_result_d, sizeof(g2_projective_t));
cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_t) * N, cudaMemcpyHostToDevice);
std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
config.are_results_on_device = true;
@@ -125,140 +111,14 @@ int msm_bn254(int argc, char* argv[])
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
bn254_g2_msm_cuda(scalars_d, g2_points_d, msm_size, config, g2_result_d);
cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_bn254), cudaMemcpyDeviceToHost);
std::cout << g2_projective_bn254::to_affine(g2_result) << std::endl;
cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
cudaFree(scalars_d);
cudaFree(g2_points_d);
cudaFree(g2_result_d);
delete[] g2_points;
delete[] scalars;
return 0;
}
int msm_bls12_377(int argc, char* argv[])
{
std::cout << "Icicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
std::cout << "Example parameters" << std::endl;
int batch_size = 1;
std::cout << "Batch size: " << batch_size << std::endl;
unsigned msm_size = 1048576;
std::cout << "MSM size: " << msm_size << std::endl;
int N = batch_size * msm_size;
std::cout << "Part I: use G1 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
scalar_bls12377* scalars = new scalar_bls12377[N];
affine_bls12377* points = new affine_bls12377[N];
projective_bls12377 result;
scalar_bls12377::rand_host_many(scalars, N);
projective_bls12377::rand_host_many_affine(points, N);
std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
device_context::DeviceContext ctx = device_context::get_default_device_context();
msm::MSMConfig config = {
ctx, // ctx
0, // points_size
1, // precompute_factor
0, // c
0, // bitsize
10, // large_bucket_factor
1, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
false, // are_points_on_device
false, // are_points_montgomery_form
false, // are_results_on_device
false, // is_big_triangle
false, // is_async
};
config.batch_size = batch_size;
std::cout << "Running MSM kernel with on-host inputs" << std::endl;
cudaStream_t stream = config.ctx.stream;
cudaStreamCreate(&stream);
// Execute the MSM kernel
bls12_377_msm_cuda(scalars, points, msm_size, config, &result);
std::cout << projective_bls12377::to_affine(result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
scalar_bls12377* scalars_d_bls;
affine_bls12377* points_d_bls;
projective_bls12377* result_d_bls;
cudaMalloc(&scalars_d_bls, sizeof(scalar_bls12377) * N);
cudaMalloc(&points_d_bls, sizeof(affine_bls12377) * N);
cudaMalloc(&result_d_bls, sizeof(projective_bls12377));
cudaMemcpy(scalars_d_bls, scalars, sizeof(scalar_bls12377) * N, cudaMemcpyHostToDevice);
cudaMemcpy(points_d_bls, points, sizeof(affine_bls12377) * N, cudaMemcpyHostToDevice);
std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
config.are_results_on_device = true;
config.are_scalars_on_device = true;
config.are_points_on_device = true;
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
// Execute the MSM kernel
bls12_377_msm_cuda(scalars_d_bls, points_d_bls, msm_size, config, result_d_bls);
// Copy the result back to the host
cudaMemcpy(&result, result_d_bls, sizeof(projective_bls12377), cudaMemcpyDeviceToHost);
// Print the result
std::cout << projective_bls12377::to_affine(result) << std::endl;
// Free the device memory
cudaFree(scalars_d_bls);
cudaFree(points_d_bls);
cudaFree(result_d_bls);
// Free the host memory, keep scalars for G2 example
delete[] points;
std::cout << "Part II: use G2 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
// use the same scalars
g2_affine_bls12377* g2_points = new g2_affine_bls12377[N];
g2_projective_bls12377::rand_host_many_affine(g2_points, N);
std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
config.are_results_on_device = false;
config.are_scalars_on_device = false;
config.are_points_on_device = false;
g2_projective_bls12377 g2_result;
bls12_377_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
std::cout << g2_projective_bls12377::to_affine(g2_result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
g2_affine_bls12377* g2_points_d;
g2_projective_bls12377* g2_result_d;
cudaMalloc(&scalars_d_bls, sizeof(scalar_bls12377) * N);
cudaMalloc(&g2_points_d, sizeof(g2_affine_bls12377) * N);
cudaMalloc(&g2_result_d, sizeof(g2_projective_bls12377));
cudaMemcpy(scalars_d_bls, scalars, sizeof(scalar_bls12377) * N, cudaMemcpyHostToDevice);
cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_bls12377) * N, cudaMemcpyHostToDevice);
std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
config.are_results_on_device = true;
config.are_scalars_on_device = true;
config.are_points_on_device = true;
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
bls12_377_g2_msm_cuda(scalars_d_bls, g2_points_d, msm_size, config, g2_result_d);
cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_bn254), cudaMemcpyDeviceToHost);
std::cout << g2_projective_bls12377::to_affine(g2_result) << std::endl;
cudaFree(scalars_d_bls);
cudaFree(g2_points_d);
cudaFree(g2_result_d);
delete[] g2_points;
delete[] scalars;
return 0;
}
int main(int argc, char* argv[])
{
std::cout << "Starting BN254 MSM" << std::endl;
msm_bn254(argc, argv);
std::cout << "Starting BLS12-377 MSM" << std::endl;
msm_bls12_377(argc, argv);
cudaStreamDestroy(stream);
return 0;
}

View File

@@ -18,7 +18,7 @@ add_executable(
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bls12_377.a)
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -7,13 +7,9 @@ mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bls12_377
cmake --build build/icicle
rm build/icicle/CMakeCache.txt
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -4,17 +4,14 @@
#include <nvml.h>
#include "api/bn254.h"
#include "api/bls12_377.h"
#include "vec_ops/vec_ops.cuh"
using namespace vec_ops;
// using namespace bn254;
typedef bn254::scalar_t T;
using namespace bn254;
typedef bls12_377::scalar_t T_bls;
typedef scalar_t T;
int vector_mult_bn254(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
{
vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
config.is_a_on_device = true;
@@ -28,24 +25,10 @@ int vector_mult_bn254(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, devic
return 0;
}
int vector_mult_bls12377(T_bls* vec_b, T_bls* vec_a, T_bls* vec_result, size_t n_elments, device_context::DeviceContext ctx)
{
vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
config.is_a_on_device = true;
config.is_b_on_device = true;
config.is_result_on_device = true;
cudaError_t err = bls12_377_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int main(int argc, char** argv)
{
const unsigned vector_size = 1 << 15;
const unsigned repetitions = 1 ;
const unsigned repetitions = 1 << 15;
cudaError_t err;
nvmlInit();
@@ -78,53 +61,41 @@ int main(int argc, char** argv)
// host data
T* host_in1 = (T*)malloc(vector_size * sizeof(T));
T* host_in2 = (T*)malloc(vector_size * sizeof(T));
T_bls* host_in1_bls12377 = (T_bls*)malloc(vector_size * sizeof(T_bls));
T_bls* host_in2_bls12377 = (T_bls*)malloc(vector_size * sizeof(T_bls));
std::cout << "Initializing vectors with random data" << std::endl;
T::rand_host_many(host_in1, vector_size);
T::rand_host_many(host_in2, vector_size);
T_bls::rand_host_many(host_in1_bls12377, vector_size);
T_bls::rand_host_many(host_in2_bls12377, vector_size);
// device data
device_context::DeviceContext ctx = device_context::get_default_device_context();
T* device_in1_bn254;
T* device_in2_bn254;
T* device_out_bn254;
T_bls* device_in1_bls12377;
T_bls* device_in2_bls12377;
T_bls* device_out_bls12377;
T* device_in1;
T* device_in2;
T* device_out;
err = cudaMalloc((void**)&device_in1_bn254, vector_size * sizeof(T));
err = cudaMalloc((void**)&device_in1_bls12377, vector_size * sizeof(T_bls));
err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_in2_bn254, vector_size * sizeof(T));
err = cudaMalloc((void**)&device_in2_bls12377, vector_size * sizeof(T_bls));
err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_out_bn254, vector_size * sizeof(T));
err = cudaMalloc((void**)&device_out_bls12377, vector_size * sizeof(T_bls));
err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
// copy from host to device
err = cudaMemcpy(device_in1_bn254, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
err = cudaMemcpy(device_in1_bls12377, host_in1_bls12377, vector_size * sizeof(T_bls), cudaMemcpyHostToDevice);
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMemcpy(device_in2_bn254, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
err = cudaMemcpy(device_in2_bls12377, host_in2_bls12377, vector_size * sizeof(T_bls), cudaMemcpyHostToDevice);
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
@@ -133,10 +104,7 @@ int main(int argc, char** argv)
std::cout << "Starting warm-up" << std::endl;
// Warm-up loop
for (int i = 0; i < repetitions; i++) {
std::cout << "bn254 mult" << std::endl;
vector_mult_bn254(device_in1_bn254, device_in2_bn254, device_out_bn254, vector_size, ctx);
std::cout << "bls12-377 mult" << std::endl;
vector_mult_bls12377(device_in1_bls12377, device_in2_bls12377, device_out_bls12377, vector_size, ctx);
vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
}
std::cout << "Starting benchmarking" << std::endl;
@@ -154,7 +122,7 @@ int main(int argc, char** argv)
auto start_time = std::chrono::high_resolution_clock::now();
// Benchmark loop
for (int i = 0; i < repetitions; i++) {
vector_mult_bn254(device_in1_bn254, device_in2_bn254, device_out_bn254, vector_size, ctx);
vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
@@ -178,7 +146,7 @@ int main(int argc, char** argv)
// Optional: validate multiplication
T* host_out = (T*)malloc(vector_size * sizeof(T));
cudaMemcpy(host_out, device_out_bn254, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
// validate multiplication here...
@@ -186,9 +154,9 @@ int main(int argc, char** argv)
free(host_in1);
free(host_in2);
free(host_out);
cudaFree(device_in1_bn254);
cudaFree(device_in2_bn254);
cudaFree(device_out_bn254);
cudaFree(device_in1);
cudaFree(device_in2);
cudaFree(device_out);
nvmlShutdown();
return 0;
}

View File

@@ -11,6 +11,17 @@
using namespace hash;
namespace keccak {
// Input rate in bytes
const int KECCAK_256_RATE = 136;
const int KECCAK_512_RATE = 72;
// Digest size in u64
const int KECCAK_256_DIGEST = 4;
const int KECCAK_512_DIGEST = 8;
// Number of state elements in u64
const int KECCAK_STATE_SIZE = 25;
class Keccak : public Hasher<uint8_t, uint64_t>
{
public:
@@ -22,7 +33,7 @@ namespace keccak {
unsigned int output_len,
const device_context::DeviceContext& ctx) const override;
Keccak(unsigned int rate) : Hasher<uint8_t, uint64_t>(25, 25, rate, 0) {}
Keccak(unsigned int rate) : Hasher<uint8_t, uint64_t>(KECCAK_STATE_SIZE, KECCAK_STATE_SIZE, rate, 0) {}
};
} // namespace keccak

View File

@@ -11,13 +11,15 @@ namespace keccak {
extern "C" cudaError_t
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
{
return Keccak(136).hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, 4, config);
return Keccak(KECCAK_256_RATE)
.hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_256_DIGEST, config);
}
extern "C" cudaError_t
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
{
return Keccak(72).hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, 8, config);
return Keccak(KECCAK_512_RATE)
.hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_512_DIGEST, config);
}
extern "C" cudaError_t build_keccak256_merkle_tree_cuda(
@@ -27,7 +29,7 @@ namespace keccak {
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config)
{
Keccak keccak(136);
Keccak keccak(KECCAK_256_RATE);
return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
leaves, digests, height, input_block_len, keccak, keccak, tree_config);
}
@@ -39,7 +41,7 @@ namespace keccak {
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config)
{
Keccak keccak(72);
Keccak keccak(KECCAK_512_RATE);
return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
leaves, digests, height, input_block_len, keccak, keccak, tree_config);
}

View File

@@ -161,7 +161,7 @@ namespace keccak {
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
__device__ void keccakf(u64 s[25])
__device__ void keccakf(u64 s[KECCAK_STATE_SIZE])
{
u64 t0, t1, t2, t3, t4;
@@ -188,7 +188,7 @@ namespace keccak {
const uint8_t* b_input = input + sid * input_block_size;
uint64_t* b_output = output + sid * output_len;
uint64_t state[25] = {}; // Initialize with zeroes
uint64_t state[KECCAK_STATE_SIZE] = {}; // Initialize with zeroes
int input_len = input_block_size;
@@ -238,12 +238,12 @@ namespace keccak {
int number_of_gpu_blocks = (number_of_states - 1) / number_of_threads + 1;
switch (rate) {
case 136:
keccak_hash_blocks<136><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
case KECCAK_256_RATE:
keccak_hash_blocks<KECCAK_256_RATE><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
input, input_len, output_len, number_of_states, output);
break;
case 72:
keccak_hash_blocks<72><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
case KECCAK_512_RATE:
keccak_hash_blocks<KECCAK_512_RATE><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
input, input_len, output_len, number_of_states, output);
break;
default:

View File

@@ -18,7 +18,7 @@ exclude = [
]
[workspace.package]
version = "2.7.1"
version = "2.8.0"
edition = "2021"
authors = [ "Ingonyama" ]
homepage = "https://www.ingonyama.com"

View File

@@ -24,6 +24,6 @@ pub(crate) mod tests {
let ctx = device_context::DeviceContext::default();
let sponge = Poseidon::load(2, &ctx).unwrap();
check_build_field_merkle_tree::<_, _, Bls12_377TreeBuilder>(25, 2, &sponge, &sponge, ScalarField::zero());
check_build_field_merkle_tree::<_, _, Bls12_377TreeBuilder>(18, 2, &sponge, &sponge, ScalarField::zero());
}
}

View File

@@ -24,6 +24,6 @@ pub(crate) mod tests {
let ctx = device_context::DeviceContext::default();
let sponge = Poseidon::load(2, &ctx).unwrap();
check_build_field_merkle_tree::<_, _, Bls12_381TreeBuilder>(25, 2, &sponge, &sponge, ScalarField::zero());
check_build_field_merkle_tree::<_, _, Bls12_381TreeBuilder>(18, 2, &sponge, &sponge, ScalarField::zero());
}
}