Compare commits

...

199 Commits

Author SHA1 Message Date
hadaringonyama
0a163b85f6 format 2024-08-07 16:38:26 +03:00
hadaringonyama
29f678068d karatsuba try - not good 2024-08-07 16:30:35 +03:00
hadaringonyama
ecd7a0d6e9 add-sub 64 works 2024-08-06 13:33:24 +03:00
hadaringonyama
f7fcd14714 multiplier works 2024-08-01 18:40:19 +03:00
Yuval Shekel
5332f4c8f8 minor update to rust poly example 2024-07-30 17:28:55 +03:00
Yuval Shekel
a45746fc3b fix rust examples calling load_backend() with removed param 2024-07-30 15:36:58 +03:00
Yuval Shekel
4a4b25445d rename template files to not format them 2024-07-30 12:11:27 +03:00
Yuval Shekel
dc1b2fce6f fix: bug where wrong polynomial factory is used to construct polynomial from wrong field 2024-07-30 12:03:57 +03:00
Yuval Shekel
65ca51ca5e update Rust examples to support installed backend 2024-07-28 22:10:00 +03:00
Yuval Shekel
f8661ac0ef update rust examples msm and ntt to support installed backend dir 2024-07-28 20:14:04 +03:00
Yuval Shekel
5d821d3db1 refactor C++ examples to work with install-dir, or build from source and choose device 2024-07-28 19:32:01 +03:00
Yuval Shekel
131e22d3c6 fix example C++ msm, missing G2 flag 2024-07-28 18:32:33 +03:00
Yuval Shekel
10963fbe0c refactor rust example msm 2024-07-28 17:08:12 +03:00
Yuval Shekel
2a3dcd776a fix format 2024-07-28 15:36:30 +03:00
Yuval Shekel
8d03fb6bc8 trigger examples build in ci 2024-07-28 15:33:00 +03:00
Yuval Shekel
445eec88af refactor C++ example risc0 2024-07-28 15:30:48 +03:00
Yuval Shekel
a5a8bf8e3d refactor C++ example poly-api 2024-07-28 15:20:23 +03:00
Yuval Shekel
68802bc14a refactor C++ example polynomial-multiplication for V3 2024-07-28 14:49:02 +03:00
Yuval Shekel
63dcd4ef5d remove multiply c++ example and fixed some readme 2024-07-28 13:01:01 +03:00
Yuval Shekel
c302aebeca refactor pedersen-commitment example to V3 2024-07-28 12:55:12 +03:00
Yuval Shekel
b07d02096f update ntt example for V3 2024-07-28 12:38:50 +03:00
Yuval Shekel
454fe5b922 update ntt-best-practice and msm examples to V3 2024-07-25 18:32:31 +03:00
Yuval Shekel
99e58532ad update readme of C++ msm example 2024-07-25 15:41:58 +03:00
Yuval Shekel
9618e6db74 split backend registration part to backend headers to avoid dep on concrete fields/curves 2024-07-24 20:34:05 +03:00
Yuval Shekel
fc2ac41710 refactor msm example 2024-07-24 20:27:26 +03:00
Yuval Shekel
49beb324a8 add script for generating C apis 2024-07-24 20:12:42 +03:00
Yuval Shekel
b52191f9dd field generation update from V2 2024-07-24 17:08:37 +03:00
Yuval Shekel
05d918fcda let PR choose the cuda-backend branch to use 2024-07-21 17:37:44 +03:00
Yuval Shekel
bdf3cd5672 add info logs to NTT test to find the issue 2024-07-21 11:36:44 +03:00
Yuval Shekel
58780f0115 fix compilation issues for mac 2024-07-18 18:40:01 +03:00
Yuval Shekel
4f47c9d4f3 remove redundant link to stdc++ in build.rs scripts 2024-07-18 18:11:11 +03:00
Yuval Shekel
1baabd93d6 add feature to rust crates to choose local/remote cuda backend 2024-07-18 18:03:42 +03:00
Yuval Shekel
a8e794fdfd use ssh-key in ci workflows to pull cuda backend 2024-07-18 17:27:49 +03:00
Yuval Shekel
934f934976 pull cuda backend when building in ci 2024-07-18 14:25:51 +03:00
Yuval Shekel
cd3353362a move cuda backend to private repo and clone it in ci 2024-07-18 12:41:27 +03:00
Yuval Shekel
afe48317a6 skip fast twiddles for domains smaller than 16 2024-07-18 12:02:22 +03:00
Yuval Shekel
7de94fff84 fix bug in cuda MSM when scalars are in montgomery form 2024-07-17 19:11:47 +03:00
Yuval Shekel
d3b27a7b68 randomize device for C++ polynomial tests 2024-07-17 19:11:47 +03:00
Yuval Shekel
0409c6f5d7 replace msm config bases_size with boolean flag indicating bases are shared for batch elements 2024-07-17 19:11:47 +03:00
Yuval Shekel
9c35aaa5c3 add doxygen comments to frontend APIs 2024-07-17 19:11:47 +03:00
Yuval Shekel
db19c3846d split default poly backend from cuda and support CPU too 2024-07-17 19:11:47 +03:00
Yuval Shekel
06243e5493 add polynomial evaluation api and cuda backend. Use it in poly backend 2024-07-17 19:11:47 +03:00
Yuval Shekel
8ade7ef393 use vector_add in polynomial backend to add monomial inplace 2024-07-17 19:11:47 +03:00
Yuval Shekel
749d3f9b6e add vec_op api for find highest non zero idx of vector, and CUDA backend 2024-07-17 19:11:47 +03:00
Yuval Shekel
0885195619 add vec_ops apis for scalar+-vector and use in polynomial backend 2024-07-17 19:11:47 +03:00
Yuval Shekel
6f8c480fd1 use vector_div in polynomial backend 2024-07-17 19:11:47 +03:00
Yuval Shekel
f13f24cff2 add vec_div api 2024-07-17 19:11:47 +03:00
Yuval Shekel
028f59cef0 use vec_mul in polynomial api instead of another kernel 2024-07-17 19:11:47 +03:00
Yuval Shekel
61af965b41 add api for mul scalar and use in poly backend 2024-07-17 19:11:47 +03:00
Yuval Shekel
091a19354e add slice api to vec ops 2024-07-17 19:11:46 +03:00
Yuval Shekel
0c8a849d7c move default polynomial context from cuda backend to icicle FE 2024-07-17 19:11:46 +03:00
Yuval Shekel
87a25afee7 use icicle runtime apis in polynomial implementation instead of cuda apis 2024-07-17 19:11:46 +03:00
Yuval Shekel
4d0b1a03a3 removed device_id from polynomial view 2024-07-17 19:11:46 +03:00
Yuval Shekel
de88ad144e minor: CPU_REF reuse device_api class like CPU 2024-07-17 19:11:46 +03:00
Yuval Shekel
9b46b8489f api for memset and tests for copy() and memset() 2024-07-17 19:11:46 +03:00
Yuval Shekel
caf3b4fab2 add generic copy and copy_async with auto device inference 2024-07-17 19:11:46 +03:00
Yuval Shekel
e49540de20 polynomial refactor to be multi-device 2024-07-17 19:11:46 +03:00
Yuval Shekel
74f7a3cbcd move tracking logic to runtime.cpp rather than device API 2024-07-17 19:11:46 +03:00
Shanie Winitz
36c879912a NTT simple cpu backend (#533) 2024-07-17 19:11:46 +03:00
Yuval Shekel
ca7fb84cf0 switch device when releasing memory of inactive device 2024-07-17 19:11:46 +03:00
Yuval Shekel
76cf23d02a ecntt bug fix from V2 2024-07-17 19:11:46 +03:00
Yuval Shekel
5a7c0ccd22 print domain size when ntt fails on that. same domain size for ntt and ecntt to avoid issues 2024-07-17 19:11:46 +03:00
Yuval Shekel
ec596c3372 rust polynomials 2024-07-17 19:11:46 +03:00
Yuval Shekel
be7cbbac89 polynomial tests fix regarding msm 2024-07-17 19:11:46 +03:00
Yuval Shekel
ef85fccd08 ecntt bug fix from v2 2024-07-17 19:11:46 +03:00
Yuval Shekel
40693a0e3e polynomial API V3 2024-07-17 19:11:46 +03:00
Yuval Shekel
6b095e6e29 track memory allocations via MemoryTracker and query pointer to device runtime APIs 2024-07-17 19:11:46 +03:00
Yuval Shekel
b8e9f90e87 fix field tests compilation and find nvcc via cmake 2024-07-17 19:11:46 +03:00
Yuval Shekel
8a6cf0258b msm test multi-device bug fix: allocate stream on correct device 2024-07-17 19:11:46 +03:00
Yuval Shekel
3e50854b61 field and curve host arithmetic tests 2024-07-17 19:11:46 +03:00
Yuval Shekel
fe28f8f160 reintroduced ntt tests against risc0 and lambdaworks 2024-07-17 19:11:46 +03:00
Yuval Shekel
1146f5620a rust ci merge build and test for faster ci 2024-07-17 19:11:46 +03:00
Yuval Shekel
401b5b8b33 bit reverse in rust 2024-07-17 19:11:46 +03:00
Yuval Shekel
48269e3a8e bit reverse api and test, including cpu and cuda backends 2024-07-17 19:11:46 +03:00
Yuval Shekel
f23051967d missing MSM cuda test 2024-07-17 19:11:46 +03:00
Yuval Shekel
7ed3cc71ce matrix ops consolidated to vec ops, some TODO fixed 2024-07-17 19:11:46 +03:00
Yuval Shekel
763b736c46 msm for cuda backend 2024-07-17 19:11:46 +03:00
Yuval Shekel
f2236562f2 rust msm 2024-07-17 19:11:45 +03:00
Yuval Shekel
45c0e3e4b9 rust bls12-377 build.rs fix 2024-07-17 19:11:45 +03:00
Yuval Shekel
2da454aa84 rust grumpkin curve 2024-07-17 19:11:45 +03:00
Yuval Shekel
0cdfb84a65 bw6-761 rust crate 2024-07-17 19:11:45 +03:00
Yuval Shekel
6578125a88 rust bls curves 2024-07-17 19:11:45 +03:00
Yuval Shekel
e14be6fc5b rust ecntt 2024-07-17 19:11:45 +03:00
Yuval Shekel
d9248b7110 bn254 curve crate 2024-07-17 19:11:45 +03:00
Yuval Shekel
86b757a6ce stark252 rust crate 2024-07-17 19:11:45 +03:00
Yuval Shekel
6627b7b5f5 rust missing ntt tests 2024-07-17 19:11:45 +03:00
Yuval Shekel
3072f6ce21 test utilities to facilitate loading backends and choosing devices atomically for all test suites 2024-07-17 19:11:45 +03:00
Yuval Shekel
29c83c7453 replaced runtime_errors with icicle macro for throwing exceptions 2024-07-17 19:11:45 +03:00
Yuval Shekel
de650b8784 rust v3 ntt 2024-07-17 19:11:45 +03:00
Yuval Shekel
e67ac8608f rust matrix transpose 2024-07-17 19:11:45 +03:00
Yuval Shekel
df290c07a4 rust v3 vec ops, api to list registered devices and config-extension clone 2024-07-17 19:11:45 +03:00
Yuval Shekel
ffc68c25e0 wrap() errors and install icicle into the deps cargo build dir 2024-07-17 19:11:45 +03:00
Yuval Shekel
59ff2d9a11 refactored rust vec_ops config to avoid taking ownership of stream but work with ffi 2024-07-17 19:11:45 +03:00
Yuval Shekel
ca4281dafc refactored device class to own the type string 2024-07-17 19:11:45 +03:00
Yuval Shekel
a1c0c8eed8 rust icicle-core vec_ops config and default device API 2024-07-17 19:11:45 +03:00
Yuval Shekel
d8115c0404 ConfigExtension rust wrapper 2024-07-17 19:11:45 +03:00
Yuval Shekel
754f7bc2ae ConfigExtension is now pointer in config for easier bindings 2024-07-17 19:11:45 +03:00
Yuval Shekel
5c85599ac2 rust v3 babybear crate 2024-07-17 19:11:45 +03:00
Yuval Shekel
15abeb93c2 rust v3 icicle-core crate 2024-07-17 19:11:45 +03:00
Yuval Shekel
a86ebb47ec simple v3 rust example 2024-07-17 19:11:45 +03:00
Yuval Shekel
25d53e86f4 small cmake refactor, and install target 2024-07-17 19:11:45 +03:00
Yuval Shekel
c5b75f7868 minor rust ci fix 2024-07-17 19:11:45 +03:00
Yuval Shekel
42f774abbd fix identification of changed files 2024-07-17 19:11:45 +03:00
Yuval Shekel
ff6306c7a7 rust ci 2024-07-17 19:11:45 +03:00
Yuval Shekel
acb49eb598 rust runtime crate async ops 2024-07-17 19:11:45 +03:00
Yuval Shekel
9ae8d44cf1 rust runtime crate streams 2024-07-17 19:11:44 +03:00
Yuval Shekel
83cdf4145c rust runtime crate copy to/from device test 2024-07-17 19:11:44 +03:00
Yuval Shekel
3b92f60372 rust runtime crate progress 2024-07-17 19:11:44 +03:00
Yuval Shekel
da007d112e mac compilation fix and api to query device availability 2024-07-17 19:11:44 +03:00
Yuval Shekel
3309044a8c rust runtime crate 2024-07-17 19:11:44 +03:00
Yuval Shekel
baae668edf fixed cuda ntt headers 2024-07-17 19:11:44 +03:00
Yuval Shekel
59843ee2b1 no ECNTT for grumpkin 2024-07-17 19:11:44 +03:00
Yuval Shekel
77eb89fc2e get_device_properties() added to device api 2024-07-17 19:11:44 +03:00
Yuval Shekel
6fe6916050 CUDA ecntt 2024-07-17 19:11:44 +03:00
Yuval Shekel
091c9ba616 montgomery conversion for cuda 2024-07-17 19:11:44 +03:00
Yuval Shekel
41962b6d23 icicle curve links to icicle_field 2024-07-17 19:11:44 +03:00
Yuval Shekel
42490afdad fix order of loaded libs 2024-07-17 19:11:44 +03:00
Yuval Shekel
aea2dba1f0 CUDA ntt 2024-07-17 19:11:44 +03:00
Yuval Shekel
0e4af8f3ca montgomery conversion CUDA 2024-07-17 19:11:44 +03:00
Yuval Shekel
039065082a transpose and montgomery CUDA apis 2024-07-17 19:11:44 +03:00
Yuval Shekel
c94bcad4f6 CUDA vector ops 2024-07-17 19:11:44 +03:00
Yuval Shekel
6671c3b391 define options in main cmake 2024-07-17 19:11:44 +03:00
Yuval Shekel
4e7b8869da ecntt api 2024-07-17 19:11:44 +03:00
Yuval Shekel
5b5fbfb9ad avoid template specialization for curves where g2_affine is same type as affine 2024-07-17 19:11:44 +03:00
Yuval Shekel
c04a3ab182 github workflow bug fix 2024-07-17 19:11:44 +03:00
Yuval Shekel
e2237ec34d montgomery conversion G2 2024-07-17 19:11:44 +03:00
Yuval Shekel
4318aa1072 MSM G2 2024-07-17 19:11:44 +03:00
Yuval Shekel
729d1c0fd9 montgomery conversion G1 2024-07-17 19:11:44 +03:00
Yuval Shekel
632fb91a4b missing G2 ifdef 2024-07-17 19:11:44 +03:00
Yuval Shekel
bd436f2c00 missing matrix transpose for ext field and moved generate scalars api 2024-07-17 19:11:44 +03:00
Yuval Shekel
954b0e1891 ffi extern curve methods 2024-07-17 19:11:44 +03:00
Yuval Shekel
212068c196 icicle_device links to dl 2024-07-17 19:11:44 +03:00
Yuval Shekel
3c028bf44a msm precompute bases API 2024-07-17 19:11:44 +03:00
Yuval Shekel
e524db39d8 introduce log mechanism 2024-07-17 19:11:43 +03:00
Yuval Shekel
d3d296699d missing ext in msm config 2024-07-17 19:11:43 +03:00
Yuval Shekel
b43518138c fix symbol redefinition when registering an API for multiple backends in same unit 2024-07-17 19:11:43 +03:00
Yuval Shekel
53d73d3e69 disable NTT for unsupported fields 2024-07-17 19:11:43 +03:00
Yuval Shekel
d55dc491d0 curve tests should only be built for curves, not field 2024-07-17 19:11:43 +03:00
Yuval Shekel
87f725da9f msm API now returns projective type 2024-07-17 19:11:43 +03:00
Yuval Shekel
4faace4d44 small refactor to curve headers to avoid linkage errors with external libs 2024-07-17 19:11:43 +03:00
Yuval Shekel
aab5d0d6b5 macro for unique identifiers 2024-07-17 19:11:43 +03:00
Yuval Shekel
4f8fe8387f format files 2024-07-17 19:11:43 +03:00
Yuval Shekel
e5b12bbd50 fix mac linkage issue due to missing linkage to icicle_device 2024-07-17 19:11:43 +03:00
Yuval Shekel
0e0470d20a api to load backend libraries 2024-07-17 19:11:43 +03:00
Yuval Shekel
87323a68d5 msm api with issue for result type 2024-07-17 19:11:43 +03:00
Yuval Shekel
874ebf569b icicle and backends are now shared libs 2024-07-17 19:11:43 +03:00
Yuval Shekel
6d0b326f0b curve lib and test 2024-07-17 19:11:43 +03:00
Yuval Shekel
fe12616c5e extension field vector ops and ntt APIs 2024-07-17 19:11:43 +03:00
Yuval Shekel
eb5a6f95d8 removed field specific APIs from shared headers. Will move the field specific headers 2024-07-17 19:11:43 +03:00
Yuval Shekel
2b4a4c22e7 vec ops size i32->u64 2024-07-17 19:11:43 +03:00
Yuval Shekel
c2267714ad scalar_convert_montgomery() API 2024-07-17 19:11:43 +03:00
Yuval Shekel
c1e2a55919 generate_scalars() API 2024-07-17 19:11:43 +03:00
Yuval Shekel
c4c7744648 ntt init/release domain for cpu backend and test 2024-07-17 19:11:43 +03:00
Yuval Shekel
ba2a03fbc1 ntt release domain API 2024-07-17 19:11:43 +03:00
Yuval Shekel
c07eb08587 ntt init domain API 2024-07-17 19:11:43 +03:00
Yuval Shekel
eda61b2e28 matrix transpose 2024-07-17 19:11:43 +03:00
Yuval Shekel
e40c661dee removed extern C from backend APIs to avoid linkage issues 2024-07-17 19:11:43 +03:00
Yuval Shekel
42b659b2c6 removed redundant includes 2024-07-17 19:11:43 +03:00
Yuval Shekel
b1abf09896 refactored apis to be template with specialization 2024-07-17 19:11:43 +03:00
Yuval Shekel
0216068892 removed include/icicle from incdir to force include with icicle prefix 2024-07-17 19:11:43 +03:00
Yuval Shekel
7068723fad config extension 2024-07-17 19:11:43 +03:00
Yuval Shekel
4fcd5eb35c libs are built with names based on the field/curve 2024-07-17 19:11:43 +03:00
Yuval Shekel
608880db96 generic dispatcher class 2024-07-17 19:11:43 +03:00
Yuval Shekel
3ae2799cde add vec-sub and vec-mul apis 2024-07-17 19:11:42 +03:00
Yuval Shekel
0a48a5d49a refactored to snake case 2024-07-17 19:11:42 +03:00
Yuval Shekel
37e4fafad9 cmake update to force load on Clang and AppleClang too 2024-07-17 19:11:42 +03:00
Yuval Shekel
ade9651ba0 ntt and empty CPU backend 2024-07-17 19:11:42 +03:00
Yuval Shekel
0dcece404e CPU vec ops is template 2024-07-17 19:11:42 +03:00
Yuval Shekel
8b8d778294 use CONCAT_EXPAND() macro to generate symbols per field/curve 2024-07-17 19:11:42 +03:00
Yuval Shekel
ff1017e38b renamed icicle_fe to icicle_v3 2024-07-17 19:11:42 +03:00
Yuval Shekel
513b45f5cb ci compilation fix 2024-07-17 19:11:42 +03:00
Yuval Shekel
b50d6aa3cb spelling 2024-07-17 19:11:42 +03:00
Yuval Shekel
e4e1b3f8d7 refactored runtime APIs to use thread local device 2024-07-17 19:11:42 +03:00
Yuval Shekel
8d9c2b1014 cuda backend for vector add + test CPU vs CUDA 2024-07-17 19:11:42 +03:00
Yuval Shekel
d15edcb60e add config struct to vec ops 2024-07-17 19:11:42 +03:00
Yuval Shekel
fdec847061 vectorAdd operates on Field type 2024-07-17 19:11:42 +03:00
Yuval Shekel
7ade66b977 vectorAdd api for int type 2024-07-17 19:11:42 +03:00
Yuval Shekel
0af022ad49 verify valid device 2024-07-17 19:11:42 +03:00
Yuval Shekel
d5aac95ff9 CUDA backend device 2024-07-17 19:11:42 +03:00
Yuval Shekel
5bbdb0f501 device API tests: async, error, invalid device 2024-07-17 19:11:42 +03:00
Yuval Shekel
755a44d7e8 CPU backend implemented with basic test 2024-07-17 19:11:42 +03:00
Yuval Shekel
b3fad0fff3 mock cpu backend for device API 2024-07-17 19:11:42 +03:00
Yuval Shekel
d7f86e6942 errors and device_api interface 2024-07-17 19:11:42 +03:00
Yuval Shekel
5410d072ee basic cmake scripts and test 2024-07-17 19:11:42 +03:00
Yuval Shekel
4fa154dceb enable CI for V3 2024-07-17 19:11:42 +03:00
release-bot
aacec3f72f Bump rust crates' version
icicle-babybear@2.8.0
icicle-bls12-377@2.8.0
icicle-bls12-381@2.8.0
icicle-bn254@2.8.0
icicle-bw6-761@2.8.0
icicle-core@2.8.0
icicle-cuda-runtime@2.8.0
icicle-grumpkin@2.8.0
icicle-hash@2.8.0
icicle-m31@2.8.0
icicle-stark252@2.8.0

Generated by cargo-workspaces
2024-07-16 13:57:56 +00:00
ChickenLover
a8fa05d0e3 Feat/roman/hash docs (#556)
## Describe the changes

This PR...

## Linked Issues

Resolves #

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-07-16 16:39:35 +03:00
ChickenLover
ea71faf1fa add keccak tree builder (#555) 2024-07-15 15:31:12 +07:00
ChickenLover
7fd9ed1b49 Feat/roman/tree builder (#525)
# Updates:

## Hashing

 - Added SpongeHasher class
 - Can be used to accept any hash function as an argument
 - Absorb and squeeze are now separated
 - Memory management is now mostly done by the SpongeHasher class; each hash function only describes permutation kernels (a sketch of this interface follows after this entry)

## Tree builder

 - Tree builder is now hash-agnostic. 
 - Tree builder now supports 2D input (matrices)
 - Tree builder can now use two different hash functions for layer 0 and compression layers

## Poseidon1

 - Interface changed to classes
 - Now allows for any alpha
 - Now allows passing constants not in a single vector
 - Now allows for any domain tag
 - Constants are now released upon going out of scope
 - Rust wrappers changed to Poseidon struct
 
 ## Poseidon2
 
 - Interface changed to classes
 - Constants are now released upon going out of scope
 - Rust wrappers changed to Poseidon2 struct
 
## Keccak

 - Added Keccak class which inherits SpongeHasher
 - Now doesn't use gpu registers for storing states
 
 To do:
- [x] Update poseidon1 golang bindings
- [x] Update poseidon1 examples
- [x] Fix poseidon2 cuda test
- [x] Fix poseidon2 merkle tree builder test
- [x] Update keccak class with new design
- [x] Update keccak test
- [x] Check keccak correctness
- [x] Update tree builder rust wrappers
- [x] Leave doc comments

Future work:  
- [ ] Add keccak merkle tree builder externs
- [ ] Add keccak rust tree builder wrappers
- [ ] Write docs
- [ ] Add example
- [ ] Fix device output for tree builder

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
Co-authored-by: nonam3e <71525212+nonam3e@users.noreply.github.com>
2024-07-11 13:46:25 +07:00
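A minimal sketch of what a sponge-style hasher interface along these lines could look like. The class and method names (`SpongeHasherSketch`, `permute`) are illustrative assumptions, not the actual ICICLE API, and the absorb/squeeze bodies are elided:

```cpp
// Illustrative only: a sponge interface where absorb and squeeze are separate
// steps and a concrete hash contributes nothing but its permutation.
#include <cstddef>
#include <vector>

template <typename S> // S: a field element or limb type
class SpongeHasherSketch {
public:
  virtual ~SpongeHasherSketch() = default;

  // Buffer the input and invoke permute() once per rate-sized block (elided).
  void absorb(const S* input, std::size_t len);

  // Emit state words, permuting between output blocks (elided).
  void squeeze(S* output, std::size_t len);

protected:
  // The only hash-specific piece: Poseidon, Poseidon2, Keccak, ... each
  // override this with their permutation kernel; memory management stays here.
  virtual void permute(std::vector<S>& state) = 0;
};
```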
DmytroTym
2d4059c61f Field creation automated through macros (#551)
Params files for fields now only require the modulus to be specified by the user
(plus a twiddle generator and/or non-residue when either or both are needed).
Everything else is generated by a macro (a toy sketch follows after this entry).
2024-07-08 10:39:50 +03:00
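A toy sketch of the idea, not ICICLE's actual macro: the user states only the modulus and the remaining parameters are derived at compile time. A 32-bit modulus keeps the arithmetic readable, and names like `DEFINE_FIELD_PARAMS` are hypothetical:

```cpp
#include <cstdint>
#include <iostream>

constexpr unsigned two_adicity(uint64_t p) {
  // largest s such that 2^s divides p - 1 (size of the 2^s roots-of-unity subgroup)
  unsigned s = 0;
  for (uint64_t t = p - 1; t % 2 == 0; t /= 2) ++s;
  return s;
}

#define DEFINE_FIELD_PARAMS(NAME, MODULUS)                                      \
  struct NAME {                                                                 \
    static constexpr uint64_t modulus = (MODULUS);                              \
    static constexpr unsigned omegas_count = two_adicity((MODULUS));            \
    static constexpr uint64_t montgomery_r = (uint64_t(1) << 32) % (MODULUS);   \
  };

// The user only states the modulus; everything else above is derived.
DEFINE_FIELD_PARAMS(BabyBearToy, 0x78000001u) // 2^31 - 2^27 + 1

int main() {
  std::cout << BabyBearToy::omegas_count << "\n"; // prints 27
}
```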
release-bot
73cd4c0a99 Bump rust crates' version
icicle-babybear@2.7.1
icicle-bls12-377@2.7.1
icicle-bls12-381@2.7.1
icicle-bn254@2.7.1
icicle-bw6-761@2.7.1
icicle-core@2.7.1
icicle-cuda-runtime@2.7.1
icicle-grumpkin@2.7.1
icicle-hash@2.7.1
icicle-m31@2.7.1
icicle-stark252@2.7.1

Generated by cargo-workspaces
2024-07-04 12:34:26 +00:00
yshekel
5516320ad7 fix large (>512 elements) ecntt issue (#553)
This PR solves an issue for large ECNTT where CUDA blocks are too large and
cannot be scheduled on SMs. The fix is to reduce the thread count per block and
increase the block count in that case (see the sketch after this entry).
2024-07-04 15:33:49 +03:00
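A minimal sketch of the launch-configuration trade-off described above, not the actual ICICLE kernel code; the 512-point threshold and thread counts are illustrative only:

```cpp
#include <cstdint>
#include <utility>

// Returns {threads_per_block, num_blocks} for an ECNTT over n points.
// EC points are several field elements wide, so large blocks can exceed
// per-SM register/shared-memory budgets and fail to be scheduled; past a
// size threshold we shrink the block and grow the grid instead.
inline std::pair<uint32_t, uint32_t> ecntt_launch_config(uint32_t n)
{
  const uint32_t threads = (n > 512) ? 64 : 256;        // illustrative values
  const uint32_t blocks = (n + threads - 1) / threads;  // ceil(n / threads)
  return {threads, blocks};
}
```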
Vlad
a4b1eb3de9 Fix affine to projective zero point bug (#552)
## Describe the changes

This PR fixes the affine-to-projective functions in the bindings by adding a
condition: if the point in affine form is zero, return the projective zero
(see the sketch after this entry).

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-07-04 09:31:59 +03:00
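A minimal sketch of the described special case, with placeholder types rather than the real binding code; the point at infinity has no affine (x, y), so it must map explicitly to the projective identity:

```cpp
#include <cstdint>

// Placeholder coordinate types; real implementations use field elements.
struct Affine     { uint64_t x, y; bool is_zero; };
struct Projective { uint64_t x, y, z; };

Projective to_projective(const Affine& p)
{
  // Affine zero (point at infinity) cannot use the generic (x : y : 1)
  // embedding; return the projective identity (0 : 1 : 0) instead.
  if (p.is_zero) return Projective{0, 1, 0};
  return Projective{p.x, p.y, 1};
}
```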
release-bot
31083463be Bump rust crates' version
icicle-babybear@2.7.0
icicle-bls12-377@2.7.0
icicle-bls12-381@2.7.0
icicle-bn254@2.7.0
icicle-bw6-761@2.7.0
icicle-core@2.7.0
icicle-cuda-runtime@2.7.0
icicle-grumpkin@2.7.0
icicle-hash@2.7.0
icicle-m31@2.7.0
icicle-stark252@2.7.0

Generated by cargo-workspaces
2024-07-03 19:06:35 +00:00
nonam3e
b908053c0c Feat/m31 (#547)
This PR adds support for the m31 field

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-07-03 20:48:28 +07:00
Stas
29da36d7be RISC0 example using Polynomial API (#548)
## New Example

This new C++ example shows the basics of the RISC0 protocol using our
Polynomial API
2024-07-02 08:00:03 -06:00
HadarIngonyama
4fef542346 MSM - fixed bug in reduction phase (#549)
This PR fixes a bug in the iterative reduction algorithm.
Unsynchronized threads were reading and writing the same addresses, which
caused MSM to fail a small percentage of the time; this is now fixed
(see the sketch after this entry).
2024-06-30 12:05:55 +03:00
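A minimal CUDA sketch of this class of bug (unsynchronized readers and writers in an iterative reduction), not the actual MSM kernel; here a block-level barrier separates iterations of a single-block sum:

```cpp
// Assumes n is a power of two and the kernel is launched with a single block.
__global__ void reduce_block(unsigned long long* data, int n)
{
  for (int stride = n / 2; stride > 0; stride /= 2) {
    for (int i = threadIdx.x; i < stride; i += blockDim.x) {
      data[i] += data[i + stride];
    }
    // Without this barrier a thread could start the next iteration and read
    // data[i + stride] before the thread producing it has finished writing,
    // giving intermittently wrong results (the failure mode described above).
    __syncthreads();
  }
  // data[0] now holds the sum of data[0..n).
}
```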
release-bot
f812f071fa Bump rust crates' version
icicle-babybear@2.6.0
icicle-bls12-377@2.6.0
icicle-bls12-381@2.6.0
icicle-bn254@2.6.0
icicle-bw6-761@2.6.0
icicle-core@2.6.0
icicle-cuda-runtime@2.6.0
icicle-grumpkin@2.6.0
icicle-hash@2.6.0
icicle-stark252@2.6.0

Generated by cargo-workspaces
2024-06-24 11:56:28 +00:00
Jeremy Felder
2b07513310 [FEAT]: Golang Bindings for pinned host memory (#519)
## Describe the changes

This PR adds the capability to pin host memory in the golang bindings,
making data transfers quicker. Memory can be pinned once for multiple devices
by passing the flag `cuda_runtime.CudaHostRegisterPortable` or
`cuda_runtime.CudaHostAllocPortable`, depending on which pinning API is used
(see the sketch after this entry)
2024-06-24 14:03:44 +03:00
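For reference, a small sketch of what those flags map to at the CUDA runtime level (plain CUDA C++, not the Go binding itself): `cudaHostRegisterPortable` pins memory the application already owns for all devices, while `cudaHostAllocPortable` allocates new pinned memory:

```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<float> data(1 << 20);

  // Pin (page-lock) existing host memory so async copies can use DMA;
  // the Portable flag makes the registration visible to every device.
  cudaError_t err = cudaHostRegister(data.data(), data.size() * sizeof(float),
                                     cudaHostRegisterPortable);
  if (err != cudaSuccess) { std::printf("%s\n", cudaGetErrorString(err)); return 1; }

  // Or allocate fresh pinned memory directly.
  float* pinned = nullptr;
  cudaHostAlloc(reinterpret_cast<void**>(&pinned), 1024 * sizeof(float),
                cudaHostAllocPortable);

  cudaFreeHost(pinned);
  cudaHostUnregister(data.data());
  return 0;
}
```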
464 changed files with 30795 additions and 7411 deletions


@@ -5,21 +5,20 @@ golang:
- go.mod
- .github/workflows/golang.yml
rust:
- wrappers/rust/**/*
- wrappers/rust_v3/**/*
- '!wrappers/rust/README.md'
- .github/workflows/rust.yml
- .github/workflows/v3_rust.yml
cpp:
- icicle/**/*.cu
- icicle/**/*.cuh
- icicle/**/*.cpp
- icicle/**/*.hpp
- icicle/**/*.c
- icicle/**/*.h
- icicle/CMakeLists.txt
- .github/workflows/cpp_cuda.yml
- icicle/cmake/Common.cmake
- icicle/cmake/CurvesCommon.cmake
- icicle/cmake/FieldsCommon.cmake
- icicle_v3/**/*.cu
- icicle_v3/**/*.cuh
- icicle_v3/**/*.cpp
- icicle_v3/**/*.hpp
- icicle_v3/**/*.c
- icicle_v3/**/*.h
- icicle_v3/CMakeLists.txt
- .github/workflows/v3.yml
- icicle_v3/cmake/curve.cmake
- icicle_v3/cmake/field.cmake
examples:
- examples/**/*
- .github/workflows/examples.yml


@@ -4,7 +4,8 @@ on:
pull_request:
branches:
- main
- V2
- V3
- yshekel/V3
jobs:
spelling-checker:


@@ -3,12 +3,10 @@ name: C++/CUDA
on:
pull_request:
branches:
- main
- V2
- main
push:
branches:
- main
- V2
- main
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -73,6 +71,8 @@ jobs:
build_args: -DEXT_FIELD=ON
- name: stark252
build_args: -DEXT_FIELD=OFF
- name: m31
build_args: -DEXT_FIELD=ON
steps:
- name: Checkout Repo
uses: actions/checkout@v4


@@ -1,8 +1,7 @@
# This workflow is a demo of how to run all examples in the Icicle repository.
# For each language directory (c++, Rust, etc.) the workflow
# (1) loops over all examples (msm, ntt, etc.) and
# (2) runs ./compile.sh and ./run.sh in each directory.
# The script ./compile.sh should compile the example and ./run.sh should run it.
# For each language directory (c++, Rust, etc.) the workflow
# (1) loops over all examples (msm, ntt, etc.) and
# (2) runs ./run.sh in each directory.
# Each script should return 0 for success and 1 otherwise.
name: Examples
@@ -10,12 +9,11 @@ name: Examples
on:
pull_request:
branches:
- main
- V2
- V3
- yshekel/V3 # TODO remove when merged to V3
push:
branches:
- main
- V2
- V3
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -25,12 +23,42 @@ jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
extract-cuda-backend-branch:
name: Extract cuda branch name
runs-on: ubuntu-22.04
outputs:
cuda-backend-branch: ${{ steps.extract.outputs.cuda-backend-branch }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Extract Private Branch from PR Description
id: extract
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
DESCRIPTION=$(gh pr view ${{ github.event.pull_request.number }} --json body -q '.body')
echo "PR Description: $DESCRIPTION"
CUDA_BE_BRANCH=$(echo "$DESCRIPTION" | grep -oP 'cuda-backend-branch:\s*\K[^\s]+') || true
if [ -z "$CUDA_BE_BRANCH" ]; then
CUDA_BE_BRANCH="main" # Default branch if not specified
fi
echo "Extracted CUDA Backend Branch: $CUDA_BE_BRANCH"
echo "::set-output name=cuda-backend-branch::$CUDA_BE_BRANCH"
run-examples:
runs-on: [self-hosted, Linux, X64, icicle, examples]
needs: check-changed-files
needs: [check-changed-files, extract-cuda-backend-branch]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Checkout CUDA Backend
uses: actions/checkout@v4
with:
repository: ingonyama-zk/icicle-cuda-backend
path: ./icicle_v3/backend/cuda
token: ${{ secrets.GITHUB_TOKEN }}
ssh-key: ${{ secrets.CUDA_PULL_KEY }}
ref: ${{ needs.extract-branch.outputs.cuda-backend-branch }}
- name: c++ examples
working-directory: ./examples/c++
if: needs.check-changed-files.outputs.cpp_cuda == 'true' || needs.check-changed-files.outputs.examples == 'true'
@@ -40,11 +68,10 @@ jobs:
if [ -d "$dir" ]; then
echo "Running command in $dir"
cd $dir
./compile.sh
./run.sh
./run.sh -d CUDA
cd -
fi
done
done
- name: Rust examples
working-directory: ./examples/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.examples == 'true'
@@ -54,7 +81,7 @@ jobs:
if [ -d "$dir" ]; then
echo "Running command in $dir"
cd $dir
cargo run --release
./run.sh -d CUDA
cd -
fi
done
done


@@ -3,12 +3,10 @@ name: GoLang
on:
pull_request:
branches:
- main
- V2
- main
push:
branches:
- main
- V2
- main
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}


@@ -3,12 +3,10 @@ name: Rust
on:
pull_request:
branches:
- main
- V2
- main
push:
branches:
- main
- V2
- main
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -62,8 +60,8 @@ jobs:
# We need to limit the number of threads to avoid running out of memory on weaker machines
# ignored tests are polynomial tests. Since they conflict with NTT tests, they are executed separately
run: |
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2 --ignored
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --exclude icicle-m31 --release --verbose --features=g2 -- --test-threads=2 --ignored
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --exclude icicle-m31 --release --verbose --features=g2 -- --test-threads=2
- name: Run baby bear tests
working-directory: ./wrappers/rust/icicle-fields/icicle-babybear
@@ -79,6 +77,13 @@ jobs:
cargo test --release --verbose -- --ignored
cargo test --release --verbose
- name: Run m31 tests
working-directory: ./wrappers/rust/icicle-fields/icicle-m31
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
cargo test --release --verbose -- --ignored
cargo test --release --verbose
# build-windows:
# name: Build on Windows
# runs-on: windows-2022

.github/workflows/v3.yml (new file, 126 lines)

@@ -0,0 +1,126 @@
name: C++/CUDA
on:
pull_request:
branches:
- V3
- yshekel/V3 # TODO remove when merged to V3
push:
branches:
- V3
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
check-format:
name: Check Code Format
runs-on: ubuntu-22.04
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Check clang-format
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
extract-cuda-backend-branch:
name: Extract cuda branch name
runs-on: ubuntu-22.04
outputs:
cuda-backend-branch: ${{ steps.extract.outputs.cuda-backend-branch }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Extract Private Branch from PR Description
id: extract
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
DESCRIPTION=$(gh pr view ${{ github.event.pull_request.number }} --json body -q '.body')
echo "PR Description: $DESCRIPTION"
CUDA_BE_BRANCH=$(echo "$DESCRIPTION" | grep -oP 'cuda-backend-branch:\s*\K[^\s]+') || true
if [ -z "$CUDA_BE_BRANCH" ]; then
CUDA_BE_BRANCH="main" # Default branch if not specified
fi
echo "Extracted CUDA Backend Branch: $CUDA_BE_BRANCH"
echo "::set-output name=cuda-backend-branch::$CUDA_BE_BRANCH"
test-linux-curve:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format, extract-cuda-backend-branch]
strategy:
matrix:
curve:
- name: bn254
build_args: -DG2=ON -DECNTT=ON
- name: bls12_381
build_args: -DG2=ON -DECNTT=ON
- name: bls12_377
build_args: -DG2=ON -DECNTT=ON
- name: bw6_761
build_args: -DG2=ON -DECNTT=ON
- name: grumpkin
build_args:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Checkout CUDA Backend
uses: actions/checkout@v4
with:
repository: ingonyama-zk/icicle-cuda-backend
path: ./icicle_v3/backend/cuda
token: ${{ secrets.GITHUB_TOKEN }}
ssh-key: ${{ secrets.CUDA_PULL_KEY }}
ref: ${{ needs.extract-branch.outputs.cuda-backend-branch }}
- name: Build curve
working-directory: ./icicle_v3
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build && rm -rf build/*
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} -DCUDA_BACKEND=local -S . -B build
cmake --build build -j
- name: Run C++ curve Tests
working-directory: ./icicle_v3/build/tests
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest --verbose
test-linux-field:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format, extract-cuda-backend-branch]
strategy:
matrix:
field:
- name: babybear
build_args: -DEXT_FIELD=ON
- name: stark252
build_args: -DEXT_FIELD=OFF
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Checkout CUDA Backend
uses: actions/checkout@v4
with:
repository: ingonyama-zk/icicle-cuda-backend
path: ./icicle_v3/backend/cuda
token: ${{ secrets.GITHUB_TOKEN }}
ssh-key: ${{ secrets.CUDA_PULL_KEY }}
ref: ${{ needs.extract-branch.outputs.cuda-backend-branch }}
- name: Build field
working-directory: ./icicle_v3
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build && rm -rf build/*
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field.name }} ${{ matrix.field.build_args }} -DCUDA_BACKEND=local -S . -B build
cmake --build build -j
- name: Run C++ field Tests
working-directory: ./icicle_v3/build/tests
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest --verbose

.github/workflows/v3_rust.yml (new file, 82 lines)

@@ -0,0 +1,82 @@
name: Rust
on:
pull_request:
branches:
- V3
- yshekel/V3 # TODO remove when merged to V3
push:
branches:
- V3
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
check-format:
name: Check Code Format
runs-on: ubuntu-22.04
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Check rustfmt
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
working-directory: ./wrappers/rust
# "-name target -prune" removes searching in any directory named "target"
# Formatting by single file is necessary due to generated files not being present
# before building the project.
# e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
# causing rustfmt to fail.
run: if [[ $(find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
extract-cuda-backend-branch:
name: Extract cuda branch name
runs-on: ubuntu-22.04
outputs:
cuda-backend-branch: ${{ steps.extract.outputs.cuda-backend-branch }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Extract Private Branch from PR Description
id: extract
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
DESCRIPTION=$(gh pr view ${{ github.event.pull_request.number }} --json body -q '.body')
echo "PR Description: $DESCRIPTION"
CUDA_BE_BRANCH=$(echo "$DESCRIPTION" | grep -oP 'cuda-backend-branch:\s*\K[^\s]+') || true
if [ -z "$CUDA_BE_BRANCH" ]; then
CUDA_BE_BRANCH="main" # Default branch if not specified
fi
echo "Extracted CUDA Backend Branch: $CUDA_BE_BRANCH"
echo "::set-output name=cuda-backend-branch::$CUDA_BE_BRANCH"
test-linux:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format, extract-cuda-backend-branch]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Checkout CUDA Backend
uses: actions/checkout@v4
with:
repository: ingonyama-zk/icicle-cuda-backend
path: ./icicle_v3/backend/cuda
token: ${{ secrets.GITHUB_TOKEN }}
ssh-key: ${{ secrets.CUDA_PULL_KEY }}
ref: ${{ needs.extract-branch.outputs.cuda-backend-branch }}
- name: Run tests
working-directory: ./wrappers/rust_v3
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# tests are split to phases since NTT domain is global but tests have conflicting requirements
run: |
cargo build --release --verbose --features=g2,ec_ntt
cargo test --workspace --release --verbose --features=g2,ec_ntt,cuda_backend -- --skip phase
cargo test phase2 --workspace --release --verbose --features=g2,ec_ntt,cuda_backend
cargo test phase3 --workspace --release --verbose --features=g2,ec_ntt,cuda_backend

.gitignore (2 changed lines)

@@ -17,4 +17,4 @@
**/Cargo.lock
**/icicle/build/
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
**/build*
**/build/*


@@ -25,7 +25,7 @@ func main() {
input := createHostSliceFromHexString("1725b6")
outHost256 := make(core.HostSlice[uint8], 32)
cfg := keccak.GetDefaultKeccakConfig()
cfg := keccak.GetDefaultHashConfig()
e := keccak.Keccak256(input, int32(input.Len()), 1, outHost256, &cfg)
if e.CudaErrorCode != cr.CudaSuccess {
panic("Keccak256 hashing failed")
@@ -49,8 +49,8 @@ func main() {
## Keccak Methods
```go
func Keccak256(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *KeccakConfig) core.IcicleError
func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *KeccakConfig) core.IcicleError
func Keccak256(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError
func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError
```
### Parameters
@@ -59,18 +59,18 @@ func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int3
- **`inputBlockSize`**: An integer specifying the size of the input data for a single hash.
- **`numberOfBlocks`**: An integer specifying the number of results in the hash batch.
- **`output`**: A slice where the resulting hash will be stored. This slice can be in host or device memory.
- **`config`**: A pointer to a `KeccakConfig` object, which contains various configuration options for the Keccak256 operation.
- **`config`**: A pointer to a `HashConfig` object, which contains various configuration options for the Keccak256 operation.
### Return Value
- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the Keccak256/Keccak512 operation.
## KeccakConfig
## HashConfig
The `KeccakConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
The `HashConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
```go
type KeccakConfig struct {
type HashConfig struct {
Ctx cr.DeviceContext
areInputsOnDevice bool
areOutputsOnDevice bool
@@ -87,8 +87,8 @@ type KeccakConfig struct {
### Default Configuration
Use `GetDefaultKeccakConfig` to obtain a default configuration, which can then be customized as needed.
Use `GetDefaultHashConfig` to obtain a default configuration, which can then be customized as needed.
```go
func GetDefaultKeccakConfig() KeccakConfig
func GetDefaultHashConfig() HashConfig
```


@@ -14,9 +14,62 @@ At its core, Keccak consists of a permutation function operating on a state arra
## Using Keccak
ICICLE Keccak supports batch hashing, which can be utilized for constructing a merkle tree.
ICICLE Keccak supports batch hashing, which can be utilized for constructing a merkle tree or running multiple hashes in parallel.
### Supported Bindings
- [Golang](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/hash/keccak)
- [Rust](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-hash)
- [Rust](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-hash)
### Example usage
This is an example of running 1024 Keccak-256 hashes in parallel, where input strings are of size 136 bytes:
```rust
use icicle_core::hash::HashConfig;
use icicle_cuda_runtime::memory::HostSlice;
use icicle_hash::keccak::keccak256;
let config = HashConfig::default();
let input_block_len = 136;
let number_of_hashes = 1024;
let preimages = vec![1u8; number_of_hashes * input_block_len];
let mut digests = vec![0u8; number_of_hashes * 64];
let preimages_slice = HostSlice::from_slice(&preimages);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
keccak256(
preimages_slice,
input_block_len as u32,
number_of_hashes as u32,
digests_slice,
&config,
)
.unwrap();
```
### Merkle Tree
You can build a keccak merkle tree using the corresponding functions:
```rust
use icicle_core::tree::{merkle_tree_digests_len, TreeBuilderConfig};
use icicle_cuda_runtime::memory::HostSlice;
use icicle_hash::keccak::build_keccak256_merkle_tree;
let mut config = TreeBuilderConfig::default();
config.arity = 2;
let height = 22;
let input_block_len = 136;
let leaves = vec![1u8; (1 << height) * input_block_len];
let mut digests = vec![0u64; merkle_tree_digests_len((height + 1) as u32, 2, 1)];
let leaves_slice = HostSlice::from_slice(&leaves);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
build_keccak256_merkle_tree(leaves_slice, digests_slice, height, input_block_len, &config).unwrap();
```
In the example above, a binary tree of height 22 is being built. Each leaf is considered to be a 136 byte long array. The leaves and digests are aligned in a flat array. You can also use keccak512 in `build_keccak512_merkle_tree` function.


@@ -53,6 +53,7 @@ So for Poseidon of arity 2 and input of size 1024 * 2, we would expect 1024 elem
### Supported Bindings
[`Go`](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/golang/curves/bn254/poseidon/poseidon.go)
[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon)
### Constants
@@ -91,8 +92,6 @@ primitive_element = 7 # bls12-381
# primitive_element = 15 # bw6-761
```
We only support `alpha = 5` so if you want to use another alpha for S-box please reach out on discord or open a github issue.
### Rust API
This is the most basic way to use the Poseidon API.
@@ -101,71 +100,58 @@ This is the most basic way to use the Poseidon API.
let test_size = 1 << 10;
let arity = 2u32;
let ctx = get_default_device_context();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
let poseidon = Poseidon::load(arity, &ctx).unwrap();
let config = HashConfig::default();
let inputs = vec![F::one(); test_size * arity as usize];
let outputs = vec![F::zero(); test_size];
let mut input_slice = HostOrDeviceSlice::on_host(inputs);
let mut output_slice = HostOrDeviceSlice::on_host(outputs);
poseidon_hash_many::<F>(
poseidon.hash_many::<F>(
&mut input_slice,
&mut output_slice,
test_size as u32,
arity as u32,
&constants,
1, // Output length
&config,
)
.unwrap();
```
The `PoseidonConfig::default()` can be modified, by default the inputs and outputs are set to be on `Host` for example.
The `HashConfig` can be modified, by default the inputs and outputs are set to be on `Host` for example.
```rust
impl<'a> Default for PoseidonConfig<'a> {
impl<'a> Default for HashConfig<'a> {
fn default() -> Self {
let ctx = get_default_device_context();
Self {
ctx,
are_inputs_on_device: false,
are_outputs_on_device: false,
input_is_a_state: false,
aligned: false,
loop_state: false,
is_async: false,
}
}
}
```
In the example above `load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();` is used which will load the correct constants based on arity and curve. Its possible to [generate](#constants) your own constants and load them.
In the example above `Poseidon::load(arity, &ctx).unwrap();` is used which will load the correct constants based on arity and curve. Its possible to [generate](#constants) your own constants and load them.
```rust
let ctx = get_default_device_context();
let cargo_manifest_dir = env!("CARGO_MANIFEST_DIR");
let constants_file = PathBuf::from(cargo_manifest_dir)
.join("tests")
.join(format!("{}_constants.bin", field_prefix));
let mut constants_buf = vec![];
File::open(constants_file)
.unwrap()
.read_to_end(&mut constants_buf)
.unwrap();
let mut custom_constants = vec![];
for chunk in constants_buf.chunks(field_bytes) {
custom_constants.push(F::from_bytes_le(chunk));
}
let custom_constants = create_optimized_poseidon_constants::<F>(
arity as u32,
&ctx,
full_rounds_half,
partial_rounds,
&mut custom_constants,
)
.unwrap();
let custom_poseidon = Poseidon::new(
arity, // The arity of poseidon hash. The width will be equal to arity + 1
alpha, // The S-box power
full_rounds_half,
partial_rounds,
round_constants,
mds_matrix,
non_sparse_matrix,
sparse_matrices,
domain_tag,
ctx,
)
.unwrap();
```
## The Tree Builder
@@ -175,21 +161,34 @@ The tree builder allows you to build Merkle trees using Poseidon.
You can define both the tree's `height` and its `arity`. The tree `height` determines the number of layers in the tree, including the root and the leaf layer. The `arity` determines how many children each internal node can have.
```rust
let height = 20;
let arity = 2;
let leaves = vec![F::one(); 1 << (height - 1)];
let mut digests = vec![F::zero(); merkle_tree_digests_len(height, arity)];
let mut leaves_slice = HostOrDeviceSlice::on_host(leaves);
let ctx = get_default_device_context();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap()
use icicle_bn254::tree::Bn254TreeBuilder;
use icicle_bn254::poseidon::Poseidon;
let mut config = TreeBuilderConfig::default();
config.keep_rows = 1;
build_poseidon_merkle_tree::<F>(&mut leaves_slice, &mut digests, height, arity, &constants, &config).unwrap();
let arity = 2;
config.arity = arity as u32;
let input_block_len = arity;
let leaves = vec![F::one(); (1 << height) * arity];
let mut digests = vec![F::zero(); merkle_tree_digests_len((height + 1) as u32, arity as u32, 1)];
println!("Root: {:?}", digests[0..1][0]);
let leaves_slice = HostSlice::from_slice(&leaves);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
let ctx = device_context::DeviceContext::default();
let hash = Poseidon::load(2, &ctx).unwrap();
let mut config = TreeBuilderConfig::default();
config.keep_rows = 5;
Bn254TreeBuilder::build_merkle_tree(
leaves_slice,
digests_slice,
height,
input_block_len,
&hash,
&hash,
&config,
)
.unwrap();
```
Similar to Poseidon, you can also configure the Tree Builder `TreeBuilderConfig::default()`


@@ -0,0 +1,88 @@
# Poseidon2
[Poseidon2](https://eprint.iacr.org/2023/323) is a recently released optimized version of Poseidon1. The two versions differ in two crucial points. First, Poseidon is a sponge hash function, while Poseidon2 can be either a sponge or a compression function depending on the use case. Secondly, Poseidon2 is instantiated by new and more efficient linear layers with respect to Poseidon. These changes decrease the number of multiplications in the linear layer by up to 90% and the number of constraints in Plonk circuits by up to 70%. This makes Poseidon2 currently the fastest arithmetization-oriented hash function without lookups.
## Using Poseidon2
ICICLE Poseidon2 is implemented for GPU and parallelization is performed for each state.
We calculate multiple hash-sums over multiple pre-images in parallel, rather than going block by block over the input vector.
For example, for Poseidon2 of width 16, input rate 8, output elements 8 and input of size 1024 * 8, we would expect 1024 * 8 elements of output. Which means each input block would be of size 8, resulting in 1024 Poseidon2 hashes being performed.
### Supported Bindings
[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon2)
### Constants
Poseidon2 is also extremely customizable and using different constants will produce different hashes, security levels and performance results.
We support pre-calculated constants for each of the [supported curves](../core#supported-curves-and-operations). The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon2/constants) and are labeled clearly per curve `<curve_name>_poseidon2.h`.
You can also use your own set of constants as shown [here](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-fields/icicle-babybear/src/poseidon2/mod.rs#L290)
### Rust API
This is the most basic way to use the Poseidon2 API.
```rust
let test_size = 1 << 10;
let width = 16;
let rate = 8;
let ctx = get_default_device_context();
let poseidon = Poseidon2::load(width, rate, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();
let config = HashConfig::default();
let inputs = vec![F::one(); test_size * rate as usize];
let outputs = vec![F::zero(); test_size];
let mut input_slice = HostOrDeviceSlice::on_host(inputs);
let mut output_slice = HostOrDeviceSlice::on_host(outputs);
poseidon.hash_many::<F>(
&mut input_slice,
&mut output_slice,
test_size as u32,
rate as u32,
8, // Output length
&config,
)
.unwrap();
```
In the example above `Poseidon2::load(width, rate, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();` is used to load the correct constants based on width and curve. Here, the default MDS matrices and diffusion are used. If you want to get a Plonky3 compliant version, set them to `MdsType::Plonky` and `DiffusionStrategy::Montgomery` respectively.
## The Tree Builder
Similar to Poseidon1, you can use Poseidon2 in a tree builder.
```rust
use icicle_bn254::tree::Bn254TreeBuilder;
use icicle_bn254::poseidon2::Poseidon2;
let mut config = TreeBuilderConfig::default();
let arity = 2;
config.arity = arity as u32;
let input_block_len = arity;
let leaves = vec![F::one(); (1 << height) * arity];
let mut digests = vec![F::zero(); merkle_tree_digests_len((height + 1) as u32, arity as u32, 1)];
let leaves_slice = HostSlice::from_slice(&leaves);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
let ctx = device_context::DeviceContext::default();
let hash = Poseidon2::load(arity, arity, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();
let mut config = TreeBuilderConfig::default();
config.keep_rows = 5;
Bn254TreeBuilder::build_merkle_tree(
leaves_slice,
digests_slice,
height,
input_block_len,
&hash,
&hash,
&config,
)
.unwrap();
```


@@ -4,7 +4,7 @@
```rust
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
use icicle_hash::keccak::{keccak256, KeccakConfig};
use icicle_hash::keccak::{keccak256, HashConfig};
use rand::{self, Rng};
fn main() {
@@ -14,7 +14,7 @@ fn main() {
let input = HostSlice::<u8>::from_slice(initial_data.as_slice());
let mut output = DeviceVec::<u8>::cuda_malloc(32).unwrap();
let mut config = KeccakConfig::default();
let mut config = HashConfig::default();
keccak256(input, initial_data.len() as i32, 1, &mut output[..], &mut config).expect("Failed to execute keccak256 hashing");
let mut output_host = vec![0_u8; 32];
@@ -32,7 +32,7 @@ pub fn keccak256(
input_block_size: i32,
number_of_blocks: i32,
output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
config: &mut KeccakConfig,
config: &mut HashConfig,
) -> IcicleResult<()>
pub fn keccak512(
@@ -40,7 +40,7 @@ pub fn keccak512(
input_block_size: i32,
number_of_blocks: i32,
output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
config: &mut KeccakConfig,
config: &mut HashConfig,
) -> IcicleResult<()>
```
@@ -50,18 +50,18 @@ pub fn keccak512(
- **`input_block_size`**: An integer specifying the size of the input data for a single hash.
- **`number_of_blocks`**: An integer specifying the number of results in the hash batch.
- **`output`**: A slice where the resulting hash will be stored. This slice can be in host or device memory.
- **`config`**: A pointer to a `KeccakConfig` object, which contains various configuration options for the Keccak256 operation.
- **`config`**: A pointer to a `HashConfig` object, which contains various configuration options for the Keccak256 operation.
### Return Value
- **`IcicleResult`**: Returns a CUDA error code indicating the success or failure of the Keccak256/Keccak512 operation.
## KeccakConfig
## HashConfig
The `KeccakConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
The `HashConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
```rust
pub struct KeccakConfig<'a> {
pub struct HashConfig<'a> {
pub ctx: DeviceContext<'a>,
pub are_inputs_on_device: bool,
pub are_outputs_on_device: bool,
@@ -81,7 +81,7 @@ pub struct KeccakConfig<'a> {
Example initialization with default settings:
```rust
let default_config = KeccakConfig::default();
let default_config = HashConfig::default();
```
Customizing the configuration:

docs/package-lock.json (generated, 19 changed lines)

@@ -3680,6 +3680,8 @@
"version": "8.12.0",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
"optional": true,
"peer": true,
"dependencies": {
"fast-deep-equal": "^3.1.1",
"json-schema-traverse": "^1.0.0",
@@ -3694,7 +3696,9 @@
"node_modules/ajv-formats/node_modules/json-schema-traverse": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
"optional": true,
"peer": true
},
"node_modules/ajv-keywords": {
"version": "3.5.2",
@@ -16340,14 +16344,13 @@
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-2.1.1.tgz",
"integrity": "sha512-Wx0Kx52hxE7C18hkMEggYlEifqWZtYaRgouJor+WMdPnQyEK13vgEWyVNup7SoeeoLMsr4kf5h6dOW11I15MUA==",
"requires": {
"ajv": "^8.0.0"
},
"requires": {},
"dependencies": {
"ajv": {
"version": "8.12.0",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"version": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
"optional": true,
"peer": true,
"requires": {
"fast-deep-equal": "^3.1.1",
"json-schema-traverse": "^1.0.0",
@@ -16358,7 +16361,9 @@
"json-schema-traverse": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
"optional": true,
"peer": true
}
}
},


@@ -53,6 +53,11 @@ module.exports = {
label: "Poseidon Hash",
id: "icicle/primitives/poseidon",
},
{
type: "doc",
label: "Poseidon2 Hash",
id: "icicle/primitives/poseidon2",
},
],
},
{


@@ -1,23 +1,16 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
project(example)
add_executable(example example.cpp)
target_include_directories(example PRIVATE "../../../icicle_v3/include" "..")
target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
message("${CMAKE_BINARY_DIR}/icicle")
target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device)
if(BACKEND_DIR)
add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
endif()
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -17,17 +17,19 @@ Typically, you concurrently
## Best-Practices
1. Use three separate CUDA streams for Download, Upload, and Compute operations
2. Use pinned (page-locked) memory on host to speed data bus transfers. Calling `cudaHostAlloc` allocates pinned memory.
3. Use in-place NTT to save on device memory.
1. Use three separate streams for Download to device, Upload from device, and Compute operations
2. Future: Use pinned (page-locked) memory on host to speed data bus transfers.
3. Compute in-place NTT.
## Running the example
To change the default curve BN254, edit `compile.sh` and `CMakeLists.txt`
To change the default curve BN254, edit `run.sh` and `CMakeLists.txt`
```sh
./compile.sh
./run.sh
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
To compare with ICICLE baseline (i.e. non-concurrent) NTT, you can run [this example](../ntt/README.md).


@@ -1,16 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF -DMSM=OFF
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example


@@ -0,0 +1,125 @@
#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include "icicle/runtime.h"
#include "icicle/api/bn254.h"
using namespace bn254;
#include "examples_utils.h"
void initialize_input(const unsigned ntt_size, const unsigned batch_size, scalar_t* elements)
{
for (unsigned i = 0; i < ntt_size * batch_size; i++) {
elements[i] = scalar_t::from(i + 1);
}
}
int main(int argc, char* argv[])
{
try_load_and_set_backend_device(argc, argv);
// set these parameters to match the desired NTT size and batch size
const unsigned log_ntt_size = 20;
const unsigned batch_size = 16;
scalar_t basic_root = scalar_t::omega(log_ntt_size);
const unsigned ntt_size = 1 << log_ntt_size;
std::cout << "log NTT size: " << log_ntt_size << std::endl;
std::cout << "Batch size: " << batch_size << std::endl;
// Create separate streams for overlapping data transfers and kernel execution.
icicleStreamHandle stream_compute, stream_h2d, stream_d2h;
ICICLE_CHECK(icicle_create_stream(&stream_compute));
ICICLE_CHECK(icicle_create_stream(&stream_h2d));
ICICLE_CHECK(icicle_create_stream(&stream_d2h));
// Initialize NTT domain
std::cout << "Init NTT domain" << std::endl;
auto ntt_init_domain_cfg = default_ntt_init_domain_config();
// set CUDA backend specific flag for init_domain
ConfigExtension backend_cfg_ext;
backend_cfg_ext.set("fast_twiddles", true);
ntt_init_domain_cfg.ext = &backend_cfg_ext;
ICICLE_CHECK(bn254_ntt_init_domain(&basic_root, ntt_init_domain_cfg));
std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
int nof_blocks = 32;
int block_size = ntt_size * batch_size / nof_blocks;
std::cout << "Number of blocks: " << nof_blocks << ", block size: " << block_size << " Bytes" << std::endl;
// on-host pinned data
scalar_t* h_inp[2];
scalar_t* h_out[2];
for (int i = 0; i < 2; i++) {
h_inp[i] = new scalar_t[ntt_size * batch_size];
h_out[i] = new scalar_t[ntt_size * batch_size];
}
// on-device in-place data
// we need two on-device vectors to overlap data transfers with NTT kernel execution
scalar_t* d_vec[2];
for (int i = 0; i < 2; i++) {
ICICLE_CHECK(icicle_malloc((void**)&d_vec[i], sizeof(scalar_t) * ntt_size * batch_size));
}
// initialize input data
initialize_input(ntt_size, batch_size, h_inp[0]);
initialize_input(ntt_size, batch_size, h_inp[1]);
// ntt configuration
NTTConfig<scalar_t> config_compute = default_ntt_config<scalar_t>();
config_compute.batch_size = batch_size;
config_compute.are_inputs_on_device = true;
config_compute.are_outputs_on_device = true;
config_compute.is_async = true;
config_compute.stream = stream_compute;
// backend specific config extension
ConfigExtension ntt_cfg_ext;
ntt_cfg_ext.set("ntt_algorithm", 2); // mixed-radix
config_compute.ext = &ntt_cfg_ext;
for (int run = 0; run < 10; run++) {
int vec_compute = run % 2;
int vec_transfer = (run + 1) % 2;
std::cout << "Run: " << run << std::endl;
std::cout << "Compute Vector: " << vec_compute << std::endl;
std::cout << "Transfer Vector: " << vec_transfer << std::endl;
START_TIMER(inplace);
bn254_ntt(d_vec[vec_compute], ntt_size, NTTDir::kForward, config_compute, d_vec[vec_compute]);
// we have to delay upload to device relative to download from device by one block: preserve write after read
for (int i = 0; i <= nof_blocks; i++) {
if (i < nof_blocks) {
// copy result back from device to host
ICICLE_CHECK(icicle_copy_async(
&h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(scalar_t) * block_size,
stream_d2h));
}
if (i > 0) {
// copy next input from host to device to alternate buffer
ICICLE_CHECK(icicle_copy_async(
&d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size],
sizeof(scalar_t) * block_size, stream_h2d));
}
// synchronize upload and download at the end of the block to ensure data integrity
ICICLE_CHECK(icicle_stream_synchronize(stream_d2h));
ICICLE_CHECK(icicle_stream_synchronize(stream_h2d));
}
// synchronize compute stream with the end of the computation
ICICLE_CHECK(icicle_stream_synchronize(stream_compute));
END_TIMER(inplace, "Concurrent In-Place NTT");
}
// Clean-up
for (int i = 0; i < 2; i++) {
ICICLE_CHECK(icicle_free(d_vec[i]));
delete[](h_inp[i]);
delete[](h_out[i]);
}
ICICLE_CHECK(icicle_destroy_stream(stream_compute));
ICICLE_CHECK(icicle_destroy_stream(stream_d2h));
ICICLE_CHECK(icicle_destroy_stream(stream_h2d));
return 0;
}

View File

@@ -1,149 +0,0 @@
#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include "curves/params/bn254.cuh"
#include "api/bn254.h"
using namespace bn254;
using namespace ntt;
const std::string curve = "BN254";
typedef scalar_t S;
typedef scalar_t E;
const unsigned max_log_ntt_size = 27;
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
for (unsigned i = 0; i < ntt_size * nof_ntts; i++) {
elements[i] = E::from(i + 1);
}
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char** argv)
{
cudaDeviceReset();
cudaDeviceProp deviceProperties;
int deviceId = 0;
cudaGetDeviceProperties(&deviceProperties, deviceId);
std::string gpu_full_name = deviceProperties.name;
std::cout << gpu_full_name << std::endl;
std::string gpu_name = gpu_full_name;
std::cout << "Curve: " << curve << std::endl;
S basic_root = S::omega(max_log_ntt_size);
// change these parameters to match the desired NTT size and batch size
const unsigned log_ntt_size = 22;
const unsigned nof_ntts = 16;
std::cout << "log NTT size: " << log_ntt_size << std::endl;
const unsigned ntt_size = 1 << log_ntt_size;
std::cout << "Batch size: " << nof_ntts << std::endl;
// Create separate CUDA streams for overlapping data transfers and kernel execution.
cudaStream_t stream_compute, stream_h2d, stream_d2h;
cudaStreamCreate(&stream_compute);
cudaStreamCreate(&stream_h2d);
cudaStreamCreate(&stream_d2h);
// Create device context for NTT computation
auto ctx_compute = device_context::DeviceContext{
stream_compute, // stream
0, // device_id
0, // mempool
};
// Initialize NTT domain and configuration
bn254_initialize_domain(&basic_root, ctx_compute, /* fast twiddles */ true);
NTTConfig<S> config_compute = default_ntt_config<S>(ctx_compute);
config_compute.ntt_algorithm = NttAlgorithm::MixedRadix;
config_compute.batch_size = nof_ntts;
config_compute.are_inputs_on_device = true;
config_compute.are_outputs_on_device = true;
config_compute.is_async = true;
std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
int nof_blocks = 32;
std::cout << "Number of blocks: " << nof_blocks << std::endl;
int block_size = ntt_size * nof_ntts / nof_blocks;
// on-host pinned data
E* h_inp[2];
E* h_out[2];
for (int i = 0; i < 2; i++) {
cudaHostAlloc((void**)&h_inp[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
cudaHostAlloc((void**)&h_out[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
}
// on-device in-place data
// we need two on-device vectors to overlap data transfers with NTT kernel execution
E* d_vec[2];
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&d_vec[i], sizeof(E) * ntt_size * nof_ntts);
}
// initialize input data
initialize_input(ntt_size, nof_ntts, h_inp[0]);
initialize_input(ntt_size, nof_ntts, h_inp[1]);
cudaEvent_t compute_start, compute_stop;
cudaEventCreate(&compute_start);
cudaEventCreate(&compute_stop);
for (int run = 0; run < 10; run++) {
int vec_compute = run % 2;
int vec_transfer = (run + 1) % 2;
std::cout << "Run: " << run << std::endl;
std::cout << "Compute Vector: " << vec_compute << std::endl;
std::cout << "Transfer Vector: " << vec_transfer << std::endl;
START_TIMER(inplace);
cudaEventRecord(compute_start, stream_compute);
bn254_ntt_cuda(d_vec[vec_compute], ntt_size, NTTDir::kForward, config_compute, d_vec[vec_compute]);
cudaEventRecord(compute_stop, stream_compute);
// we have to delay upload to device relative to download from device by one block: preserve write after read
for (int i = 0; i <= nof_blocks; i++) {
if (i < nof_blocks) {
cudaMemcpyAsync(
&h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(E) * block_size,
cudaMemcpyDeviceToHost, stream_d2h);
}
if (i > 0) {
cudaMemcpyAsync(
&d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size],
sizeof(E) * block_size, cudaMemcpyHostToDevice, stream_h2d);
}
// synchronize upload and download at the end of the block to ensure data integrity
cudaStreamSynchronize(stream_d2h);
cudaStreamSynchronize(stream_h2d);
}
// synchronize compute stream with the end of the computation
cudaEventSynchronize(compute_stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, compute_start, compute_stop);
END_TIMER(inplace, "Concurrent In-Place NTT");
std::cout << "NTT time: " << milliseconds << " ms" << std::endl;
};
// Clean-up
for (int i = 0; i < 2; i++) {
cudaFree(d_vec[i]);
cudaFreeHost(h_inp[i]);
cudaFreeHost(h_out[i]);
}
cudaEventDestroy(compute_start);
cudaEventDestroy(compute_stop);
cudaStreamDestroy(stream_compute);
cudaStreamDestroy(stream_d2h);
cudaStreamDestroy(stream_h2d);
return 0;
}

View File

@@ -1,2 +1,65 @@
#!/bin/bash
./build/example/example
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICILE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_BACKEND_DIR="${ICILE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_BACKEND_DIR}" ]; then
echo "Building icicle with CUDA backend"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle
BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend")
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -S "${ICILE_DIR}" -B build/icicle
fi
cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example
cmake --build build/icicle -j
cmake --build build/example -j
./build/example/example "$DEVICE_TYPE" "$BACKEND_INSTALL_DIR"

View File

@@ -0,0 +1,38 @@
#pragma once
#include <chrono>
#include "icicle/runtime.h"
// Timer
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
// Load and choose backend
void try_load_and_set_backend_device(int argc = 0, char** argv = nullptr)
{
if (argc > 2 && 0 != strcmp(argv[2], "")) {
const char* backend_install_dir = argv[2];
std::cout << "Trying to load and backend device from " << backend_install_dir << std::endl;
ICICLE_CHECK(icicle_load_backend(backend_install_dir, true));
}
const char* selected_device = argc > 1 ? argv[1] : nullptr;
if (selected_device) {
std::cout << "selecting " << selected_device << " device" << std::endl;
ICICLE_CHECK(icicle_set_device(selected_device));
return;
}
// trying to choose CUDA if available, or fallback to CPU otherwise (default device)
const bool is_cuda_device_available = (eIcicleError::SUCCESS == icicle_is_device_avialable("CUDA"));
if (is_cuda_device_available) {
Device device = {"CUDA", 0}; // GPU-0
std::cout << "setting " << device << std::endl;
ICICLE_CHECK(icicle_set_device(device));
return;
}
std::cout << "CUDA device not available, falling back to CPU" << std::endl;
}
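For reference, a hypothetical minimal caller, matching how the examples and their `run.sh` scripts in this PR invoke the helper (`argv[1]` is the device type, `argv[2]` the backend install dir, both optional):
```c++
#include "examples_utils.h"

int main(int argc, char* argv[])
{
  // e.g. `./example CPU` or `./example CUDA /path/to/cuda/backend/install/dir`
  try_load_and_set_backend_device(argc, argv);
  // ... all subsequent icicle calls dispatch to the selected device ...
  return 0;
}
```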

View File

@@ -1,23 +1,16 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
project(example)
add_executable(example example.cpp)
target_include_directories(example PRIVATE "../../../icicle_v3/include" "..")
target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
message("${CMAKE_BINARY_DIR}/icicle")
target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device)
if(BACKEND_DIR)
add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
endif()
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,46 +1,43 @@
# Icicle example: Multi-Scalar Multiplication (MSM)
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides CUDA C++ template function `MSM` to accelerate [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Concise Usage Explanation
1. Select the curve
2. Include an MSM template
3. Configure MSM
4. Call the template
1. Include the curve api
2. Configure MSM
3. Call msm api
```c++
#define CURVE_ID 1
#include "icicle/appUtils/msm/msm.cu"
#include "icicle/api/bn254.h"
...
msm::MSMConfig config = {...};
MSMConfig config = default_msm_config();
...
msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, size, config, &result);
bn254_msm(scalars, points, size, config, &result);
```
In this example we use `BN254` curve (`CURVE_ID=1`). The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where input `points[]` use affine coordinates, and `result` uses projective coordinates.
In this example we use the `BN254` curve. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where the input `points[]` use affine coordinates and `result` uses projective coordinates.
**Parameters:**
The configuration is passed to the kernel as a structure of type `msm::MSMConfig`. Some of the most important fields are listed below:
The configuration is passed to the kernel as a structure of type `MSMConfig`. Some of the most important fields are listed below:
- `are_scalars_on_device`, `are_points_on_device`, `are_results_on_device`: location of the data
- `is_async`: blocking vs. non-blocking kernel call
- `large_bucket_factor`: distinguishes between large bucket and normal bucket sizes. If there is a scalar distribution that is skewed heavily to a few values we can operate on those separately from the rest of the values. The ideal value here can vary by circuit (based on the distribution of scalars) but start with 10 and adjust to see if it improves performance.
- In addition, backend-specific params can be passed via the config extension. For example, the CUDA backend accepts a `large_bucket_factor` param (see the sketch below).
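A rough sketch of passing such a param, assuming `MSMConfig` exposes the same `ConfigExtension* ext` field used by `NTTConfig` in the NTT examples, and that the CUDA backend reads the `large_bucket_factor` key:
```c++
MSMConfig config = default_msm_config();
ConfigExtension ext;
ext.set("large_bucket_factor", 10); // CUDA-backend specific; key name assumed
config.ext = &ext;
bn254_msm(scalars, points, size, config, &result);
```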
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
```sh
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
## What's in the example
@@ -49,4 +46,4 @@ The configuration is passed to the kernel as a structure of type `msm::MSMConfig
3. Configure and execute MSM using on-host data
4. Copy inputs on-device
5. Configure and execute MSM using on-device data
6. Repeat the above steps for G2 points
6. Repeat step 3 with G2 points (G2 MSM)

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=ON
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -0,0 +1,101 @@
#include <fstream>
#include <iostream>
#include <iomanip>
#include "icicle/runtime.h"
#include "icicle/api/bn254.h"
using namespace bn254;
#include "examples_utils.h"
int main(int argc, char* argv[])
{
try_load_and_set_backend_device(argc, argv);
std::cout << "\nIcicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
std::cout << "Example parameters" << std::endl;
int batch_size = 1;
unsigned msm_size = 1 << 10;
int N = batch_size * msm_size;
std::cout << "Batch size: " << batch_size << std::endl;
std::cout << "MSM size: " << msm_size << std::endl;
std::cout << "\nPart I: use G1 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
auto scalars = std::make_unique<scalar_t[]>(N);
auto points = std::make_unique<affine_t[]>(N);
projective_t result;
scalar_t::rand_host_many(scalars.get(), N);
projective_t::rand_host_many(points.get(), N);
std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
auto config = default_msm_config();
config.batch_size = batch_size;
std::cout << "\nRunning MSM kernel with on-host inputs" << std::endl;
// Execute the MSM kernel
START_TIMER(MSM_host_mem);
ICICLE_CHECK(bn254_msm(scalars.get(), points.get(), msm_size, config, &result));
END_TIMER(MSM_host_mem, "MSM from host-memory took");
std::cout << projective_t::to_affine(result) << std::endl;
DeviceProperties device_props;
ICICLE_CHECK(icicle_get_device_properties(device_props));
// If device does not share memory with host, copy inputs explicitly and execute msm with device pointers
if (!device_props.using_host_memory) {
std::cout << "\nReconfiguring MSM to use on-device inputs" << std::endl;
config.are_results_on_device = true;
config.are_scalars_on_device = true;
config.are_points_on_device = true;
std::cout << "Copying inputs to-device" << std::endl;
scalar_t* scalars_d;
affine_t* points_d;
projective_t* result_d;
ICICLE_CHECK(icicle_malloc((void**)&scalars_d, sizeof(scalar_t) * N));
ICICLE_CHECK(icicle_malloc((void**)&points_d, sizeof(affine_t) * N));
ICICLE_CHECK(icicle_malloc((void**)&result_d, sizeof(projective_t)));
ICICLE_CHECK(icicle_copy(scalars_d, scalars.get(), sizeof(scalar_t) * N));
ICICLE_CHECK(icicle_copy(points_d, points.get(), sizeof(affine_t) * N));
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
// Execute the MSM kernel
START_TIMER(MSM_device_mem);
ICICLE_CHECK(msm(scalars_d, points_d, msm_size, config, result_d));
END_TIMER(MSM_device_mem, "MSM from device-memory took");
// Copy the result back to the host
icicle_copy(&result, result_d, sizeof(projective_t));
// Print the result
std::cout << projective_t::to_affine(result) << std::endl;
// Free the device memory
icicle_free(scalars_d);
icicle_free(points_d);
icicle_free(result_d);
}
std::cout << "\nPart II: use G2 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
// use the same scalars
auto g2_points = std::make_unique<g2_affine_t[]>(N);
g2_projective_t::rand_host_many(g2_points.get(), N);
std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
config.are_results_on_device = false;
config.are_scalars_on_device = false;
config.are_points_on_device = false;
g2_projective_t g2_result;
START_TIMER(MSM_g2);
ICICLE_CHECK(bn254_g2_msm(scalars.get(), g2_points.get(), msm_size, config, &g2_result));
END_TIMER(MSM_g2, "MSM G2 from host-memory took");
std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
// Similar to G1 MSM, can explicitly copy to device and execute the G2 MSM using device pointers
return 0;
}

View File

@@ -1,124 +0,0 @@
#include <fstream>
#include <iostream>
#include <iomanip>
#include "api/bn254.h"
using namespace bn254;
int main(int argc, char* argv[])
{
std::cout << "Icicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
std::cout << "Example parameters" << std::endl;
int batch_size = 1;
std::cout << "Batch size: " << batch_size << std::endl;
unsigned msm_size = 1048576;
std::cout << "MSM size: " << msm_size << std::endl;
int N = batch_size * msm_size;
std::cout << "Part I: use G1 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
scalar_t* scalars = new scalar_t[N];
affine_t* points = new affine_t[N];
projective_t result;
scalar_t::rand_host_many(scalars, N);
projective_t::rand_host_many_affine(points, N);
std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
device_context::DeviceContext ctx = device_context::get_default_device_context();
msm::MSMConfig config = {
ctx, // ctx
0, // points_size
1, // precompute_factor
0, // c
0, // bitsize
10, // large_bucket_factor
1, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
false, // are_points_on_device
false, // are_points_montgomery_form
false, // are_results_on_device
false, // is_big_triangle
false, // is_async
};
config.batch_size = batch_size;
std::cout << "Running MSM kernel with on-host inputs" << std::endl;
cudaStream_t stream = config.ctx.stream;
// Execute the MSM kernel
bn254_msm_cuda(scalars, points, msm_size, config, &result);
std::cout << projective_t::to_affine(result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
scalar_t* scalars_d;
affine_t* points_d;
projective_t* result_d;
cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
cudaMalloc(&points_d, sizeof(affine_t) * N);
cudaMalloc(&result_d, sizeof(projective_t));
cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(affine_t) * N, cudaMemcpyHostToDevice);
std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
config.are_results_on_device = true;
config.are_scalars_on_device = true;
config.are_points_on_device = true;
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
// Execute the MSM kernel
bn254_msm_cuda(scalars_d, points_d, msm_size, config, result_d);
// Copy the result back to the host
cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
// Print the result
std::cout << projective_t::to_affine(result) << std::endl;
// Free the device memory
cudaFree(scalars_d);
cudaFree(points_d);
cudaFree(result_d);
// Free the host memory, keep scalars for G2 example
delete[] points;
std::cout << "Part II: use G2 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
// use the same scalars
g2_affine_t* g2_points = new g2_affine_t[N];
g2_projective_t::rand_host_many_affine(g2_points, N);
std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
config.are_results_on_device = false;
config.are_scalars_on_device = false;
config.are_points_on_device = false;
g2_projective_t g2_result;
bn254_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
g2_affine_t* g2_points_d;
g2_projective_t* g2_result_d;
cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
cudaMalloc(&g2_points_d, sizeof(g2_affine_t) * N);
cudaMalloc(&g2_result_d, sizeof(g2_projective_t));
cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_t) * N, cudaMemcpyHostToDevice);
std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
config.are_results_on_device = true;
config.are_scalars_on_device = true;
config.are_points_on_device = true;
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
bn254_g2_msm_cuda(scalars_d, g2_points_d, msm_size, config, g2_result_d);
cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
cudaFree(scalars_d);
cudaFree(g2_points_d);
cudaFree(g2_result_d);
delete[] g2_points;
delete[] scalars;
cudaStreamDestroy(stream);
return 0;
}

View File

@@ -1,2 +1,65 @@
#!/bin/bash
./build/example/example
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICILE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_BACKEND_DIR="${ICILE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_BACKEND_DIR}" ]; then
echo "Building icicle with CUDA backend"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=ON -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle
BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend")
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=ON -S "${ICILE_DIR}" -B build/icicle
fi
cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example
cmake --build build/icicle -j
cmake --build build/example -j
./build/example/example "$DEVICE_TYPE" "$BACKEND_INSTALL_DIR"

View File

@@ -1,15 +1,18 @@
#!/bin/bash
# Exit immediately on error
set -e
# TODO update for V3
mkdir -p build/example
mkdir -p build/icicle
# # Exit immediately on error
# set -e
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# mkdir -p build/example
# mkdir -p build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example
# # Configure and build Icicle
# cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
# cmake --build build/icicle
# # Configure and build the example application
# cmake -S . -B build/example
# cmake --build build/example

View File

@@ -6,6 +6,9 @@
#include "api/bn254.h"
#include "gpu-utils/error_handler.cuh"
#include "poseidon/poseidon.cuh"
#include "hash/hash.cuh"
using namespace poseidon;
using namespace bn254;
@@ -20,31 +23,20 @@ void checkCudaError(cudaError_t error)
// these global constants go into template calls
const int size_col = 11;
// this function executes the Poseidon thread
void threadPoseidon(
device_context::DeviceContext ctx,
unsigned size_partition,
scalar_t* layers,
scalar_t* column_hashes,
PoseidonConstants<scalar_t>* constants)
Poseidon<scalar_t>* poseidon)
{
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
// CHK_IF_RETURN(); I can't use it in a standard thread function
PoseidonConfig column_config = {
ctx, // ctx
false, // are_inputes_on_device
false, // are_outputs_on_device
false, // input_is_a_state
false, // aligned
false, // loop_state
false, // is_async
};
cudaError_t err =
bn254_poseidon_hash_cuda(layers, column_hashes, (size_t)size_partition, size_col, *constants, column_config);
HashConfig column_config = default_hash_config(ctx);
cudaError_t err = poseidon->hash_many(layers, column_hashes, (size_t)size_partition, size_col, 1, column_config);
checkCudaError(err);
}
@@ -59,6 +51,12 @@ using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::p
exit(EXIT_FAILURE); \
}
#define CHECK_ALLOC(ptr) \
if ((ptr) == nullptr) { \
std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
exit(EXIT_FAILURE); \
}
int main()
{
const unsigned size_row = (1 << 30);
@@ -116,19 +114,18 @@ int main()
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);
PoseidonConstants<scalar_t> column_constants0, column_constants1;
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
Poseidon<scalar_t> column_poseidon0(size_col, ctx0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);
Poseidon<scalar_t> column_poseidon1(size_col, ctx1);
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_poseidon1);
// Wait for the threads to finish
thread0.join();
@@ -141,9 +138,9 @@ int main()
std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_poseidon0);
thread3.join();
END_TIMER(sequential, "1 GPU");
std::cout << "Output Data from Thread 2: ";

View File

@@ -1,2 +1,4 @@
#!/bin/bash
./build/example/example
# TODO update for V3
# ./build/example/example

View File

@@ -1,23 +0,0 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use NVIDIA base image
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
nsight-systems-12.2 \
cmake \
protobuf-compiler \
curl \
build-essential \
git \
&& rm -rf /var/lib/apt/lists/*
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]

View File

@@ -1,24 +0,0 @@
{
"name": "Icicle Examples - Multiply",
"build": {
"dockerfile": "Dockerfile"
},
"workspaceMount": "source=${localWorkspaceFolder}/.,target=/icicle-example,type=bind",
"workspaceFolder": "/icicle-example",
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-azuretools.vscode-docker",
"ms-vscode.cpptools-extension-pack"
]
}
}
}

View File

@@ -1,25 +0,0 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,41 +0,0 @@
# Icicle example: Multiplication
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` accelerates multiplication operation `*` using [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm)
## Concise Usage Explanation
Define a `CURVE_ID` and include curve configuration header:
```c++
#define CURVE_ID 1
#include "curves/curve_config.cuh"
```
The values of `CURVE_ID` for different curves are in the above header. Multiplication is accelerated both for field scalars and point fields.
```c++
using namespace curve_config;
scalar_t a;
point_field_t b;
```
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
## What's in the example
1. Define the parameters for the example such as vector size
2. Generate random vectors on-host
3. Copy them on-device
4. Execute element-wise vector multiplication on-device
5. Copy results on-host

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -1,162 +0,0 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <nvml.h>
#include "api/bn254.h"
#include "vec_ops/vec_ops.cuh"
using namespace vec_ops;
using namespace bn254;
typedef scalar_t T;
int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
{
vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
config.is_a_on_device = true;
config.is_b_on_device = true;
config.is_result_on_device = true;
cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int main(int argc, char** argv)
{
const unsigned vector_size = 1 << 15;
const unsigned repetitions = 1 << 15;
cudaError_t err;
nvmlInit();
nvmlDevice_t device;
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
std::cout << "Icicle-Examples: vector multiplications" << std::endl;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
std::cout << "GPU Model: " << name << std::endl;
} else {
std::cerr << "Failed to get GPU model name." << std::endl;
}
unsigned power_limit;
nvmlDeviceGetPowerManagementLimit(device, &power_limit);
std::cout << "Vector size: " << vector_size << std::endl;
std::cout << "Repetitions: " << repetitions << std::endl;
std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
unsigned int baseline_power;
nvmlDeviceGetPowerUsage(device, &baseline_power);
std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
unsigned baseline_temperature;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) {
std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// host data
T* host_in1 = (T*)malloc(vector_size * sizeof(T));
T* host_in2 = (T*)malloc(vector_size * sizeof(T));
std::cout << "Initializing vectors with random data" << std::endl;
T::rand_host_many(host_in1, vector_size);
T::rand_host_many(host_in2, vector_size);
// device data
device_context::DeviceContext ctx = device_context::get_default_device_context();
T* device_in1;
T* device_in2;
T* device_out;
err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
// copy from host to device
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
std::cout << "Starting warm-up" << std::endl;
// Warm-up loop
for (int i = 0; i < repetitions; i++) {
vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
}
std::cout << "Starting benchmarking" << std::endl;
unsigned power_before;
nvmlDeviceGetPowerUsage(device, &power_before);
std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
<< " %" << std::endl;
unsigned temperature_before;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
auto start_time = std::chrono::high_resolution_clock::now();
// Benchmark loop
for (int i = 0; i < repetitions; i++) {
vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
std::cout << "Elapsed time: " << duration.count() << " microseconds" << std::endl;
unsigned power_after;
nvmlDeviceGetPowerUsage(device, &power_after);
std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
<< " %" << std::endl;
unsigned temperature_after;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// Report performance in GMPS: Giga Multiplications Per Second
double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
// Optional: validate multiplication
T* host_out = (T*)malloc(vector_size * sizeof(T));
cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
// validate multiplication here...
// clean up and exit
free(host_in1);
free(host_in2);
free(host_out);
cudaFree(device_in1);
cudaFree(device_in2);
cudaFree(device_out);
nvmlShutdown();
return 0;
}

View File

@@ -1,2 +0,0 @@
#!/bin/bash
./build/example/example

View File

@@ -1,23 +1,16 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
project(example)
add_executable(example example.cpp)
target_include_directories(example PRIVATE "../../../icicle_v3/include" "..")
target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
message("${CMAKE_BINARY_DIR}/icicle")
target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device)
if(BACKEND_DIR)
add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
endif()
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,33 +1,35 @@
# Icicle example: Number-Theoretical Transform (NTT)
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides CUDA C++ template function NTT for [Number Theoretical Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md), also known as Discrete Fourier Transform.
## Concise Usage Explanation
1. Include the curve api
2. Init NTT domain
3. Call ntt api
```c++
// Select the curve
#define CURVE_ID 1
// Include NTT template
#include "appUtils/ntt/ntt.cu"
using namespace curve_config;
using namespace ntt;
// Configure NTT
NTTConfig<S> config=DefaultNTTConfig<S>();
// Call NTT
NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
#include "icicle/api/bn254.h"
...
auto ntt_init_domain_cfg = default_ntt_init_domain_config();
...
bn254_ntt_init_domain(&basic_root, ntt_init_domain_cfg);
NTTConfig<scalar_t> config = default_ntt_config<scalar_t>();
...
bn254_ntt(input.get(), ntt_size, NTTDir::kForward, config, output.get())
```
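The CUDA backend's NTT algorithm can additionally be selected through the config extension, as the example code in this PR does (the `ntt_algorithm` key and its values are backend-specific):
```c++
ConfigExtension ntt_cfg_ext;
ntt_cfg_ext.set("ntt_algorithm", 2); // 1 = radix-2, 2 = mixed-radix (CUDA backend)
config.ext = &ntt_cfg_ext;
bn254_ntt(input.get(), ntt_size, NTTDir::kForward, config, output.get());
```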
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
```sh
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
## What's in the example

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -0,0 +1,104 @@
#include <iostream>
#include "icicle/runtime.h"
#include "icicle/api/bn254.h"
using namespace bn254;
#include "examples_utils.h"
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, scalar_t* elements);
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, scalar_t* elements);
int main(int argc, char* argv[])
{
try_load_and_set_backend_device(argc, argv);
std::cout << "\nIcicle Examples: Number Theoretical Transform (NTT)" << std::endl;
const unsigned log_ntt_size = 20;
const unsigned ntt_size = 1 << log_ntt_size;
const unsigned batch_size = 2;
std::cout << "Example parameters:" << std::endl;
std::cout << "NTT size: " << ntt_size << std::endl;
std::cout << "batch size: " << batch_size << std::endl;
std::cout << "\nGenerating input data for lowest and highest harmonics" << std::endl;
auto input = std::make_unique<scalar_t[]>(batch_size * ntt_size);
auto output = std::make_unique<scalar_t[]>(batch_size * ntt_size);
initialize_input(ntt_size, batch_size, input.get());
// Initialize NTT domain
std::cout << "\nInit NTT domain" << std::endl;
scalar_t basic_root = scalar_t::omega(log_ntt_size /*NTT_LOG_SIZE*/);
auto ntt_init_domain_cfg = default_ntt_init_domain_config();
ConfigExtension backend_cfg_ext;
backend_cfg_ext.set("fast_twiddles", true); // optionally construct fast_twiddles for CUDA backend
ntt_init_domain_cfg.ext = &backend_cfg_ext;
ICICLE_CHECK(bn254_ntt_init_domain(&basic_root, ntt_init_domain_cfg));
// ntt configuration
NTTConfig<scalar_t> config = default_ntt_config<scalar_t>();
ConfigExtension ntt_cfg_ext;
config.ext = &ntt_cfg_ext;
config.batch_size = batch_size;
// warmup
ICICLE_CHECK(bn254_ntt(input.get(), ntt_size, NTTDir::kForward, config, output.get()));
// NTT radix-2 alg
std::cout << "\nRunning NTT radix-2 alg with on-host data" << std::endl;
ntt_cfg_ext.set("ntt_algorithm", 1); // radix-2
START_TIMER(Radix2);
ICICLE_CHECK(bn254_ntt(input.get(), ntt_size, NTTDir::kForward, config, output.get()));
END_TIMER(Radix2, "Radix2 NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, batch_size, output.get());
// NTT mixed-radix alg
std::cout << "\nRunning NTT mixed-radix alg with on-host data" << std::endl;
ntt_cfg_ext.set("ntt_algorithm", 2); // mixed-radix
START_TIMER(MixedRadix);
ICICLE_CHECK(bn254_ntt(input.get(), ntt_size, NTTDir::kForward, config, output.get()));
END_TIMER(MixedRadix, "MixedRadix NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, batch_size, output.get());
return 0;
}
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, scalar_t* elements)
{
// Lowest Harmonics
for (unsigned i = 0; i < ntt_size; i = i + 1) {
elements[i] = scalar_t::one();
}
// Highest Harmonics
for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
elements[i] = scalar_t::one();
elements[i + 1] = scalar_t::neg(scalar_t::one());
}
}
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, scalar_t* elements)
{
int nof_errors = 0;
scalar_t amplitude = scalar_t::from((uint32_t)ntt_size);
// Lowest Harmonics
if (elements[0] != amplitude) {
++nof_errors;
std::cout << "Error in lowest harmonicscalar_t 0! " << std::endl;
} else {
std::cout << "Validated lowest harmonics" << std::endl;
}
// Highest Harmonics
if (elements[1 * ntt_size + ntt_size / 2] != amplitude) {
++nof_errors;
std::cout << "Error in highest harmonics! " << std::endl;
} else {
std::cout << "Validated highest harmonics" << std::endl;
}
return nof_errors;
}

View File

@@ -1,113 +0,0 @@
#include <chrono>
#include <iostream>
// include NTT template
#include "curves/params/bn254.cuh"
#include "api/bn254.h"
using namespace bn254;
using namespace ntt;
// Operate on scalars
typedef scalar_t S;
typedef scalar_t E;
void print_elements(const unsigned n, E* elements)
{
for (unsigned i = 0; i < n; i++) {
std::cout << i << ": " << elements[i] << std::endl;
}
}
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
// Lowest Harmonics
for (unsigned i = 0; i < ntt_size; i = i + 1) {
elements[i] = E::one();
}
// print_elements(ntt_size, elements );
// Highest Harmonics
for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
elements[i] = E::one();
elements[i + 1] = E::neg(scalar_t::one());
}
// print_elements(ntt_size, &elements[1*ntt_size] );
}
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
int nof_errors = 0;
E amplitude = E::from((uint32_t)ntt_size);
// std::cout << "Amplitude: " << amplitude << std::endl;
// Lowest Harmonics
if (elements[0] != amplitude) {
++nof_errors;
std::cout << "Error in lowest harmonics 0! " << std::endl;
// print_elements(ntt_size, elements );
} else {
std::cout << "Validated lowest harmonics" << std::endl;
}
// Highest Harmonics
if (elements[1 * ntt_size + ntt_size / 2] != amplitude) {
++nof_errors;
std::cout << "Error in highest harmonics! " << std::endl;
// print_elements(ntt_size, &elements[1*ntt_size] );
} else {
std::cout << "Validated highest harmonics" << std::endl;
}
return nof_errors;
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char* argv[])
{
std::cout << "Icicle Examples: Number Theoretical Transform (NTT)" << std::endl;
std::cout << "Example parameters" << std::endl;
const unsigned log_ntt_size = 20;
std::cout << "Log2(NTT size): " << log_ntt_size << std::endl;
const unsigned ntt_size = 1 << log_ntt_size;
std::cout << "NTT size: " << ntt_size << std::endl;
const unsigned nof_ntts = 2;
std::cout << "Number of NTTs: " << nof_ntts << std::endl;
const unsigned batch_size = nof_ntts * ntt_size;
std::cout << "Generating input data for lowest and highest harmonics" << std::endl;
E* input;
input = (E*)malloc(sizeof(E) * batch_size);
initialize_input(ntt_size, nof_ntts, input);
E* output;
output = (E*)malloc(sizeof(E) * batch_size);
std::cout << "Running NTT with on-host data" << std::endl;
// Create a device context
auto ctx = device_context::get_default_device_context();
S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/);
bn254_initialize_domain(&basic_root, ctx, true);
// Create an NTTConfig instance
NTTConfig<S> config = default_ntt_config<S>();
config.ntt_algorithm = NttAlgorithm::MixedRadix;
config.batch_size = nof_ntts;
START_TIMER(MixedRadix);
cudaError_t err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(MixedRadix, "MixedRadix NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
config.ntt_algorithm = NttAlgorithm::Radix2;
START_TIMER(Radix2);
err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(Radix2, "Radix2 NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
std::cout << "Cleaning-up memory" << std::endl;
free(input);
free(output);
return 0;
}

View File

@@ -1,2 +1,65 @@
#!/bin/bash
./build/example/example
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICILE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_BACKEND_DIR="${ICILE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_BACKEND_DIR}" ]; then
echo "Building icicle with CUDA backend"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle
BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend")
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -S "${ICILE_DIR}" -B build/icicle
fi
cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example
cmake --build build/icicle -j
cmake --build build/example -j
./build/example/example "$DEVICE_TYPE" "$BACKEND_INSTALL_DIR"

View File

@@ -1,26 +1,16 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
example
example.cu
)
project(example)
add_executable(example example.cpp)
target_include_directories(example PRIVATE "../../../icicle_v3/include" "..")
target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
message("${CMAKE_BINARY_DIR}/icicle")
target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device)
if(BACKEND_DIR)
add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
endif()
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,9 +1,5 @@
# ICICLE example: Pedersen Commitment
## Best-Practices
We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
A Pedersen Commitment is a cryptographic primitive to commit to a value or a vector of values while keeping it hidden, yet enabling the committer to reveal the value later. It provides both hiding (the commitment does not reveal any information about the value) and binding properties (once a value is committed, it cannot be changed without detection).
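Concretely, for committed values $m_0,\dots,m_{N-1}$, independent points $P_0,\dots,P_{N-1},Q$ with unknown discrete logs, and a random blinding scalar $r$, the commitment computed in this example is $C = \sum_{i=0}^{N-1} m_i \cdot P_i + r \cdot Q$ (names here are illustrative; the example evaluates this as a single MSM of size $N+1$).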
@@ -14,10 +10,12 @@ An example of MSM is [here](../msm/README.md).
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
```sh
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
## Concise Explanation
We recommend this simple [explanation](https://www.rareskills.io/post/pedersen-commitment).
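A rough sketch of how this maps onto the MSM API used in the example code below (variable names are illustrative):
```c++
// N committed values plus one blinding scalar, evaluated as a single MSM of size N+1.
scalars[N] = scalar_t::rand_host();        // blinding factor r
auto config = default_msm_config();
projective_t commitment;
ICICLE_CHECK(bn254_msm(scalars, points, N + 1, config, &commitment));
```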

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -1,16 +1,16 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <cassert>
#include <nvml.h>
#include "api/bn254.h"
#include "msm/msm.cuh"
#include "icicle/runtime.h"
#include "icicle/api/bn254.h"
#include "icicle/curves/params/bn254.h"
using namespace bn254;
typedef point_field_t T;
#include "examples_utils.h"
// modular power
template <typename T>
T modPow(T base, T exp)
{
T r = T::one();
@@ -27,10 +27,15 @@ T modPow(T base, T exp)
}
// Check if y2 is a quadratic residue using Euler's Criterion
bool quadratic_residue(T y2) { return modPow(y2, T::div2(T::zero() - T::one())) == T::one(); }
template <typename T>
bool quadratic_residue(T y2)
{
return modPow(y2, T::div2(T::zero() - T::one())) == T::one();
}
// modular square root adapted from:
// https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp
template <typename T>
bool mySQRT(T a, T* result)
{
if (a == T::zero()) {
@@ -72,9 +77,10 @@ bool mySQRT(T a, T* result)
}
}
template <typename T>
void point_near_x(T x, affine_t* point)
{
const T wb = T{weierstrass_b};
const T wb = T{G1::weierstrass_b};
T y2;
while (y2 = x * x * x + wb, quadratic_residue(y2) == false) {
x = x + T::one();
@@ -87,7 +93,8 @@ void point_near_x(T x, affine_t* point)
}
static int seed = 0;
static HOST_INLINE T rand_host_seed()
template <typename T>
static T rand_host_seed()
{
std::mt19937_64 generator(seed++);
std::uniform_int_distribution<unsigned> distribution;
@@ -101,16 +108,13 @@ static HOST_INLINE T rand_host_seed()
return value;
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char** argv)
{
try_load_and_set_backend_device(argc, argv);
const unsigned N = pow(2, 10);
std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a blinding factor)" << std::endl;
T* xs = new T[N + 1];
point_field_t* xs = new point_field_t[N + 1];
std::cout << "Generating random points transparently using publicly chosen seed" << std::endl;
std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment"
@@ -119,9 +123,8 @@ int main(int argc, char** argv)
std::cout << "Using seed: " << seed << std::endl;
std::cout << "Generating random field values" << std::endl;
START_TIMER(gen);
for (unsigned i = 0; i < N; i++) {
xs[i] = rand_host_seed();
xs[i] = rand_host_seed<point_field_t>();
}
END_TIMER(gen, "Time to generate field values");
std::cout << "xs[0]: " << xs[0] << std::endl;
@@ -145,9 +148,9 @@ int main(int argc, char** argv)
scalars[N] = scalar_t::rand_host();
std::cout << "Executing MSM" << std::endl;
auto config = msm::default_msm_config();
auto config = default_msm_config();
START_TIMER(msm);
bn254_msm_cuda(scalars, points, N + 1, config, &result);
bn254_msm(scalars, points, N + 1, config, &result);
END_TIMER(msm, "Time to execute MSM");
std::cout << "Computed commitment: " << result << std::endl;

View File

@@ -1,2 +1,65 @@
#!/bin/bash
./build/example/example
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICILE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_BACKEND_DIR="${ICILE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_BACKEND_DIR}" ]; then
echo "Building icicle with CUDA backend"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle
BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend")
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -S "${ICILE_DIR}" -B build/icicle
fi
cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example
cmake --build build/icicle -j
cmake --build build/example -j
./build/example/example "$DEVICE_TYPE" "$BACKEND_INSTALL_DIR"

View File

@@ -1,30 +1,16 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -DCURVE_ID=BN254")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
project(example)
add_executable(
example
example.cu
)
add_executable(example example.cpp)
target_include_directories(example PRIVATE "../../../icicle_v3/include" "..")
target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
message("${CMAKE_BINARY_DIR}/icicle")
target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device)
if(BACKEND_DIR)
add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
endif()
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_include_directories(example PRIVATE "../../../icicle/include")
# can link to another curve/field by changing the following lib and FIELD_ID
target_link_libraries(example
${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a
${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a
)
target_compile_definitions(example PUBLIC FIELD_ID BN254)

View File

@@ -1,9 +1,5 @@
# ICICLE examples: computations with polynomials
## Best-Practices
We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save time and mental energy.
## Key-Takeaway
Polynomials are crucial for Zero-Knowledge Proofs (ZKPs): they enable efficient representation and verification of computational statements, facilitate privacy-preserving protocols, and support the mathematical operations needed to construct and verify proofs without revealing the underlying data. The Polynomial API is documented [here](https://dev.ingonyama.com/icicle/polynomials/overview).
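For a quick feel of that API, here is an illustrative sketch using only calls that appear in the example further down in this diff (`from_coefficients`, evaluation via `operator()`, `degree()`); it assumes a device and the NTT domain have been initialized as in that example's `main()`, and the function name is ours, not part of the library.
```cpp
// Illustrative only: build a small polynomial and evaluate it at a random point.
#include <iostream>
#include "icicle/api/bn254.h"
#include "icicle/polynomials/polynomials.h"
using namespace icicle;
using namespace bn254;
typedef Polynomial<scalar_t> Polynomial_t;

void tiny_polynomial_demo()
{
  const scalar_t coeffs[3] = {scalar_t::from(1), scalar_t::from(2), scalar_t::from(3)}; // 1 + 2x + 3x^2
  auto f = Polynomial_t::from_coefficients(coeffs, 3);
  const auto x = scalar_t::rand_host();
  std::cout << "deg(f) = " << f.degree() << ", f(x) = " << f(x) << std::endl;
}
```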
@@ -13,9 +9,10 @@ Polynomials are crucial for Zero-Knowledge Proofs (ZKPs): they enable efficient
To run the example, from the project root directory:
```sh
cd examples/c++/polynomial-api
./compile.sh
./run.sh
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
To change the scalar field, modify `compile.h` to build the corresponding lib and `CMakeLists.txt` to link to that lib and set `FIELD_ID` correspondingly.

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -1,16 +1,13 @@
#include <iostream>
#include <cassert>
#include "polynomials/polynomials.h"
#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
#include "ntt/ntt.cuh"
#include "poseidon/tree/merkle.cuh"
#include "api/bn254.h"
#include <chrono>
// using namespace field_config;
using namespace polynomials;
using namespace merkle;
using namespace bn254;
#include "icicle/api/bn254.h"
#include "icicle/polynomials/polynomials.h"
#include "examples_utils.h"
using namespace icicle;
using namespace bn254; // typedef scalar_t as bn254-scalar type
// define the polynomial type
typedef Polynomial<scalar_t> Polynomial_t;
@@ -24,27 +21,28 @@ const auto four = scalar_t::from(4);
const auto five = scalar_t::from(5);
const auto minus_one = zero - one;
static std::unique_ptr<scalar_t[]> generate_pows(scalar_t tau, uint32_t size){
auto vec = std::make_unique<scalar_t[]>(size);
vec[0] = scalar_t::one();
for (size_t i = 1; i < size; ++i) {
vec[i] = vec[i-1] * tau;
static std::unique_ptr<scalar_t[]> generate_pows(scalar_t tau, uint32_t size)
{
auto vec = std::make_unique<scalar_t[]>(size);
vec[0] = scalar_t::one();
for (size_t i = 1; i < size; ++i) {
vec[i] = vec[i - 1] * tau;
}
return std::move(vec);
}
static std::unique_ptr<affine_t[]> generate_SRS(uint32_t size) {
static std::unique_ptr<affine_t[]> generate_SRS(uint32_t size)
{
auto secret_scalar = scalar_t::rand_host();
auto gen = projective_t::generator();
auto pows_of_tau = generate_pows(secret_scalar,size);
auto pows_of_tau = generate_pows(secret_scalar, size);
auto SRS = std::make_unique<affine_t[]>(size);
for (size_t i = 0; i < size; ++i) {
SRS[i] = projective_t::to_affine(pows_of_tau[i] * gen);
SRS[i] = projective_t::to_affine(pows_of_tau[i] * gen);
}
return std::move(SRS);
}
void example_evaluate()
{
std::cout << std::endl << "Example: Polynomial evaluation on random value" << std::endl;
@@ -312,122 +310,120 @@ void example_device_memory_view()
const int log_size = 6;
const int size = 1 << log_size;
auto f = randomize_polynomial(size);
auto [d_coeffs, N, device_id] = f.get_coefficients_view();
auto [d_coeffs, N] = f.get_coefficients_view();
// compute coset evaluations
auto coset_evals = std::make_unique<scalar_t[]>(size);
auto ntt_config = ntt::default_ntt_config<scalar_t>();
auto ntt_config = default_ntt_config<scalar_t>();
ntt_config.are_inputs_on_device = true; // using the device data directly as a view
ntt_config.coset_gen = ntt::get_root_of_unity<scalar_t>(size * 2);
ntt::ntt(d_coeffs.get(), size, ntt::NTTDir::kForward, ntt_config, coset_evals.get());
ntt_config.coset_gen = get_root_of_unity<scalar_t>(size * 2);
ntt(d_coeffs.get(), size, NTTDir::kForward, ntt_config, coset_evals.get());
}
void example_commit_with_device_memory_view()
{
//declare time vars
// declare time vars
std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
std::chrono::milliseconds duration;
std::cout << std::endl << "Example: a) commit with Polynomial views [(f1+f2)^2 + (f1-f2)^2 ]_1 = [4 (f1^2+ f_2^2)]_1" << std::endl;
std::cout<< "Example: b) commit with Polynomial views [(f1+f2)^2 - (f1-f2)^2 ]_1 = [4 f1 *f_2]_1" << std::endl;
std::cout << std::endl
<< "Example: a) commit with Polynomial views [(f1+f2)^2 + (f1-f2)^2 ]_1 = [4 (f1^2+ f_2^2)]_1" << std::endl;
std::cout << "Example: b) commit with Polynomial views [(f1+f2)^2 - (f1-f2)^2 ]_1 = [4 f1 *f_2]_1" << std::endl;
int N = 1025;
//generate group elements string of length N: (1, beta,beta^2....,beta^{N-1}). g
// generate group elements string of length N: (1, beta,beta^2....,beta^{N-1}). g
std::cout << "Setup: Generating mock SRS" << std::endl;
start = std::chrono::high_resolution_clock::now();
auto SRS = generate_SRS(2*N);
//Allocate memory on device (points)
auto SRS = generate_SRS(2 * N);
// Allocate memory on device (points)
affine_t* points_d;
cudaMalloc(&points_d, sizeof(affine_t)* 2 * N);
ICICLE_CHECK(icicle_malloc((void**)&points_d, sizeof(affine_t) * 2 * N));
// copy SRS to device (could have generated on device, but gives an indicator)
cudaMemcpy(points_d, SRS.get(), sizeof(affine_t)* 2 * N, cudaMemcpyHostToDevice);
ICICLE_CHECK(icicle_copy(points_d, SRS.get(), sizeof(affine_t) * 2 * N));
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Setup: SRS of length "<< N << " generated and loaded to device. Took: " << duration.count() << " milliseconds" << std::endl;
//goal:
//test commitment equality [(f1+f2)^2 + (f1-f2)^2 ]_1 = [4 (f1^2+ f_2^2)]_1
//test commitment equality [(f1+f2)^2 - (f1-f2)^2 ]_1 = [4 f1 *f_2]_1
//note: using polyapi to gen scalars: already on device.
std::cout << "Setup: Generating polys (on device) f1,f2 of log degree " << log2(N-1) << std::endl;
std::cout << "Setup: SRS of length " << N << " generated and loaded to device. Took: " << duration.count()
<< " milliseconds" << std::endl;
// goal:
// test commitment equality [(f1+f2)^2 + (f1-f2)^2 ]_1 = [4 (f1^2+ f_2^2)]_1
// test commitment equality [(f1+f2)^2 - (f1-f2)^2 ]_1 = [4 f1 *f_2]_1
// note: using polyapi to gen scalars: already on device.
std::cout << "Setup: Generating polys (on device) f1,f2 of log degree " << log2(N - 1) << std::endl;
start = std::chrono::high_resolution_clock::now();
auto f1 = randomize_polynomial(N);
auto f2 = randomize_polynomial(N);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Setup: Gen poly done. Took: " << duration.count() << " milliseconds" << std::endl;
//deg 2N constraints (f1+f2)^2 + (f1-f2)^2 = 2 (f1^2+ f_2^2)
std::cout << "Computing constraints..start "<< std::endl;
// deg 2N constraints (f1+f2)^2 + (f1-f2)^2 = 2 (f1^2+ f_2^2)
std::cout << "Computing constraints..start " << std::endl;
start = std::chrono::high_resolution_clock::now();
auto L1 = (f1+f2)*(f1+f2) + (f1-f2)*(f1-f2);
auto R1 = scalar_t::from(2) * (f1*f1 + f2*f2);
//deg 2N constraints (f1+f2)^2 - (f1-f2)^2 = 4 f1 *f_2
auto L2 = (f1+f2)*(f1+f2) - (f1-f2)*(f1-f2);
auto L1 = (f1 + f2) * (f1 + f2) + (f1 - f2) * (f1 - f2);
auto R1 = scalar_t::from(2) * (f1 * f1 + f2 * f2);
// deg 2N constraints (f1+f2)^2 - (f1-f2)^2 = 4 f1 *f_2
auto L2 = (f1 + f2) * (f1 + f2) - (f1 - f2) * (f1 - f2);
auto R2 = scalar_t::from(4) * f1 * f2;
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Computing constraints..done. Took: " << duration.count() << " milliseconds"<< std::endl;
std::cout << "Computing constraints..done. Took: " << duration.count() << " milliseconds" << std::endl;
// extract coeff using coeff view
auto [viewL1, sizeL1, device_idL1] = L1.get_coefficients_view();
auto [viewL2, sizeL2, device_idL2] = L2.get_coefficients_view();
auto [viewR1, sizeR1, device_idR1] = R1.get_coefficients_view();
auto [viewR2, sizeR2, device_idR2] = R2.get_coefficients_view();
std::cout << "Computing Commitments with poly view"<< std::endl;
auto [viewL1, sizeL1] = L1.get_coefficients_view();
auto [viewL2, sizeL2] = L2.get_coefficients_view();
auto [viewR1, sizeR1] = R1.get_coefficients_view();
auto [viewR2, sizeR2] = R2.get_coefficients_view();
std::cout << "Computing Commitments with poly view" << std::endl;
start = std::chrono::high_resolution_clock::now();
msm::MSMConfig config = msm::default_msm_config();
MSMConfig config = default_msm_config();
config.are_points_on_device = true;
config.are_scalars_on_device = true;
//host vars (for result)
// host vars (for result)
projective_t hL1{}, hL2{}, hR1{}, hR2{};
//straightforward msm bn254 api: no batching
bn254_msm_cuda(viewL1.get(),points_d,N,config,&hL1);
bn254_msm_cuda(viewL2.get(),points_d,N,config,&hL2);
bn254_msm_cuda(viewR1.get(),points_d,N,config,&hR1);
bn254_msm_cuda(viewR2.get(),points_d,N,config,&hR2);
// straightforward msm bn254 api: no batching
msm(viewL1.get(), points_d, N, config, &hL1);
msm(viewL2.get(), points_d, N, config, &hL2);
msm(viewR1.get(), points_d, N, config, &hR1);
msm(viewR2.get(), points_d, N, config, &hR2);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Commitments done. Took: " << duration.count() << " milliseconds"<< std::endl;
//sanity checks
std::cout << "Commitments done. Took: " << duration.count() << " milliseconds" << std::endl;
// sanity checks
auto affL1 = projective_t::to_affine(hL1);
auto affR1 = projective_t::to_affine(hR1);
auto affL2 = projective_t::to_affine(hL2);
auto affR2 = projective_t::to_affine(hR2);
//test commitment equality [(f1+f2)^2 + (f1-f2)^2]_1 = [4 (f_1^2+f_2^2]_1
assert(affL1.x==affR1.x && affL1.y==affR1.y);
std::cout << "commitment [(f1+f2)^2 + (f1-f2)^2]_1:" << std::endl;
// test commitment equality [(f1+f2)^2 + (f1-f2)^2]_1 = [4 (f_1^2+f_2^2]_1
assert(affL1.x == affR1.x && affL1.y == affR1.y);
std::cout << "commitment [(f1+f2)^2 + (f1-f2)^2]_1:" << std::endl;
std::cout << "[x: " << affL1.x << ", y: " << affL1.y << "]" << std::endl;
std::cout << "commitment [[2 (f_1^2+f_2^2]_1:" <<std::endl;
std::cout << "commitment [[2 (f_1^2+f_2^2]_1:" << std::endl;
std::cout << "[x: " << affR1.x << ", y: " << affR1.y << "]" << std::endl;
assert(affL2.x==affR2.x && affL2.y==affR2.y);
std::cout << "commitment [(f1+f2)^2 - (f1-f2)^2]_1:"<< std::endl;
assert(affL2.x == affR2.x && affL2.y == affR2.y);
std::cout << "commitment [(f1+f2)^2 - (f1-f2)^2]_1:" << std::endl;
std::cout << "[x: " << affL2.x << ", y: " << affL2.y << "]" << std::endl;
std::cout << "commitment [4 f_1*f_2]_1:"<<std::endl;
std::cout << "commitment [4 f_1*f_2]_1:" << std::endl;
std::cout << "[x: " << affR2.x << ", y: " << affR2.y << "]" << std::endl;
}
int main(int argc, char** argv)
{
// Initialize NTT. TODO: can we hide this in the library?
static const int MAX_NTT_LOG_SIZE = 24;
auto ntt_config = ntt::default_ntt_config<scalar_t>();
const scalar_t basic_root = scalar_t::omega(MAX_NTT_LOG_SIZE);
ntt::init_domain(basic_root, ntt_config.ctx);
try_load_and_set_backend_device(argc, argv);
// Virtual factory design pattern: initializing polynomials factory for CUDA backend
Polynomial_t::initialize(std::make_unique<CUDAPolynomialFactory<>>());
static const int MAX_NTT_LOG_SIZE = 24;
const scalar_t basic_root = scalar_t::omega(MAX_NTT_LOG_SIZE);
ntt_init_domain(basic_root, default_ntt_init_domain_config());
START_TIMER(polyapi);
example_evaluate();
example_clone(10);
@@ -446,5 +442,7 @@ int main(int argc, char** argv)
example_device_memory_view();
example_commit_with_device_memory_view();
END_TIMER(polyapi, "polyapi example took");
return 0;
}

View File

@@ -1,2 +1,65 @@
#!/bin/bash
./build/example/example
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICILE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_BACKEND_DIR="${ICILE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_BACKEND_DIR}" ]; then
echo "Building icicle with CUDA backend"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle
BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend")
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -S "${ICILE_DIR}" -B build/icicle
fi
cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example
cmake --build build/icicle -j
cmake --build build/example -j
./build/example/example "$DEVICE_TYPE" "$BACKEND_INSTALL_DIR"

View File

@@ -1,27 +1,16 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
project(example)
add_executable(example example.cpp)
target_include_directories(example PRIVATE "../../../icicle_v3/include" "..")
target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
message("${CMAKE_BINARY_DIR}/icicle")
target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device)
if(BACKEND_DIR)
add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
endif()
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -0,0 +1,30 @@
# Icicle Example: Polynomial Multiplication with NTT
## Key-Takeaway
Icicle provides polynomial multiplication using the Number Theoretic Transform (NTT), including forward and inverse transforms.
## Concise Usage Explanation
1. Include the necessary headers.
2. Initialize the NTT domain.
3. Prepare and transform the polynomials from host to device memory.
4. Perform pointwise multiplication.
5. Apply the inverse NTT (a condensed sketch follows this list; the full example appears further below).
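A condensed, host-only sketch of steps 1-5, assuming the NTT domain is already initialized and both inputs are zero-padded to `size` so the cyclic product equals the true polynomial product. Config field order and API names follow the full example shown further below in this diff, but the wrapper function itself is ours.
```cpp
// Sketch only: multiply two polynomials given as zero-padded coefficient arrays of length `size`.
#include <memory>
#include "icicle/runtime.h"
#include "icicle/api/bn254.h"
using namespace bn254;

void multiply_polynomials(scalar_t* a, scalar_t* b, scalar_t* result, int size)
{
  auto cfg = default_ntt_config<scalar_t>();
  cfg.ordering = Ordering::kNM; // forward kNM paired with inverse kMN skips reordering in between
  auto A = std::make_unique<scalar_t[]>(size);
  auto B = std::make_unique<scalar_t[]>(size);
  ICICLE_CHECK(bn254_ntt(a, size, NTTDir::kForward, cfg, A.get()));
  ICICLE_CHECK(bn254_ntt(b, size, NTTDir::kForward, cfg, B.get()));
  VecOpsConfig vec_cfg{nullptr, false, false, false, false, nullptr}; // all host-side, synchronous
  ICICLE_CHECK(bn254_vector_mul(A.get(), B.get(), size, vec_cfg, result)); // pointwise product
  cfg.ordering = Ordering::kMN;
  ICICLE_CHECK(bn254_ntt(result, size, NTTDir::kInverse, cfg, result)); // inverse NTT in place
}
```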
## Running the example
```sh
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
## What's in the example
1. Define the size of the example.
2. Initialize input polynomials.
3. Perform Radix-2 or Mixed-Radix NTT.
4. Perform pointwise polynomial multiplication.
5. Apply the inverse NTT.

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -0,0 +1,102 @@
#include <iostream>
#include <vector>
#include <memory>
#include "icicle/runtime.h"
#include "icicle/api/bn254.h"
using namespace bn254;
#include "examples_utils.h"
void random_samples(scalar_t* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? scalar_t::rand_host() : res[i - 1000];
}
void incremental_values(scalar_t* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + scalar_t::one() * scalar_t::omega(4) : scalar_t::zero();
}
}
// calculating polynomial multiplication A*B via NTT, pointwise multiplication and INTT
// (1) allocate A,B on HOST. Randomize first half, zero second half
// (2) allocate A,B,Res on device
// (3) calc NTT for A and for B from host to device
// (4) multiply d_polyRes = NttAGpu * NttBGpu (pointwise)
// (5) INTT d_polyRes inplace
int main(int argc, char** argv)
{
try_load_and_set_backend_device(argc, argv);
int NTT_LOG_SIZE = 20;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
// init domain
scalar_t basic_root = scalar_t::omega(NTT_LOG_SIZE);
bn254_ntt_init_domain(&basic_root, default_ntt_init_domain_config());
// (1) cpu allocation
auto polyA = std::make_unique<scalar_t[]>(NTT_SIZE);
auto polyB = std::make_unique<scalar_t[]>(NTT_SIZE);
random_samples(polyA.get(), NTT_SIZE >> 1); // second half zeros
random_samples(polyB.get(), NTT_SIZE >> 1); // second half zeros
scalar_t *d_polyA, *d_polyB, *d_polyRes;
DeviceProperties device_props;
ICICLE_CHECK(icicle_get_device_properties(device_props));
auto benchmark = [&](bool print) {
// (2) device input allocation. If the device does not share memory with the host, inputs are copied to it explicitly by the NTT calls below
ICICLE_CHECK(icicle_malloc((void**)&d_polyA, sizeof(scalar_t) * NTT_SIZE));
ICICLE_CHECK(icicle_malloc((void**)&d_polyB, sizeof(scalar_t) * NTT_SIZE));
ICICLE_CHECK(icicle_malloc((void**)&d_polyRes, sizeof(scalar_t) * NTT_SIZE));
// start recording
START_TIMER(poly_multiply)
// (3) NTT for A,B from host memory to device-memory
auto ntt_config = default_ntt_config<scalar_t>();
ntt_config.are_inputs_on_device = false;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = Ordering::kNM;
ICICLE_CHECK(bn254_ntt(polyA.get(), NTT_SIZE, NTTDir::kForward, ntt_config, d_polyA));
ICICLE_CHECK(bn254_ntt(polyB.get(), NTT_SIZE, NTTDir::kForward, ntt_config, d_polyB));
// (4) multiply A,B
VecOpsConfig config{
nullptr,
true, // is_a_on_device
true, // is_b_on_device
true, // is_result_on_device
false, // is_async
nullptr // ext
};
ICICLE_CHECK(bn254_vector_mul(d_polyA, d_polyB, NTT_SIZE, config, d_polyRes));
// (5) INTT (in place)
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = Ordering::kMN;
ICICLE_CHECK(bn254_ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, ntt_config, d_polyRes));
if (print) { END_TIMER(poly_multiply, "polynomial multiplication took"); }
ICICLE_CHECK(icicle_free(d_polyA));
ICICLE_CHECK(icicle_free(d_polyB));
ICICLE_CHECK(icicle_free(d_polyRes));
return eIcicleError::SUCCESS;
};
benchmark(false); // warmup
benchmark(true);
ICICLE_CHECK(bn254_ntt_release_domain());
return 0;
}

View File

@@ -1,119 +0,0 @@
#include <chrono>
#include <iostream>
#include <vector>
#include <memory>
#include "api/bn254.h"
#include "gpu-utils/error_handler.cuh"
using namespace bn254;
typedef scalar_t test_scalar;
typedef scalar_t test_data;
void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}
void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() * test_scalar::omega(4) : test_scalar::zero();
}
}
// calculating polynomial multiplication A*B via NTT, pointwise multiplication and INTT
// (1) allocate A,B on CPU. Randomize first half, zero second half
// (2) allocate NttAGpu, NttBGpu on GPU
// (3) calc NTT for A and for B from cpu to GPU
// (4) multiply MulGpu = NttAGpu * NttBGpu (pointwise)
// (5) INTT MulGpu inplace
int main(int argc, char** argv)
{
cudaEvent_t start, stop;
float measured_time;
int NTT_LOG_SIZE = 23;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context
// init domain
auto ntt_config = ntt::default_ntt_config<test_scalar>();
const bool is_radix2_alg = (argc > 1) ? atoi(argv[1]) : false;
ntt_config.ntt_algorithm = is_radix2_alg ? ntt::NttAlgorithm::Radix2 : ntt::NttAlgorithm::MixedRadix;
const char* ntt_alg_str = is_radix2_alg ? "Radix-2" : "Mixed-Radix";
std::cout << "Polynomial multiplication with " << ntt_alg_str << " NTT: ";
CHK_IF_RETURN(cudaEventCreate(&start));
CHK_IF_RETURN(cudaEventCreate(&stop));
test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
bn254_initialize_domain(&basic_root, ntt_config.ctx, true /*=fast_twiddles_mode*/);
// (1) cpu allocation
auto CpuA = std::make_unique<test_data[]>(NTT_SIZE);
auto CpuB = std::make_unique<test_data[]>(NTT_SIZE);
random_samples(CpuA.get(), NTT_SIZE >> 1); // second half zeros
random_samples(CpuB.get(), NTT_SIZE >> 1); // second half zeros
test_data *GpuA, *GpuB, *MulGpu;
auto benchmark = [&](bool print, int iterations = 1) {
// start recording
CHK_IF_RETURN(cudaEventRecord(start, ntt_config.ctx.stream));
for (int iter = 0; iter < iterations; ++iter) {
// (2) gpu input allocation
CHK_IF_RETURN(cudaMallocAsync(&GpuA, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaMallocAsync(&GpuB, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
// (3) NTT for A,B from cpu to gpu
ntt_config.are_inputs_on_device = false;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = ntt::Ordering::kNM;
CHK_IF_RETURN(bn254_ntt_cuda(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
CHK_IF_RETURN(bn254_ntt_cuda(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));
// (4) multiply A,B
CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
vec_ops::VecOpsConfig config{
ntt_config.ctx,
true, // is_a_on_device
true, // is_b_on_device
true, // is_result_on_device
false // is_async
};
CHK_IF_RETURN(bn254_mul_cuda(GpuA, GpuB, NTT_SIZE, config, MulGpu));
// (5) INTT (in place)
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = ntt::Ordering::kMN;
CHK_IF_RETURN(bn254_ntt_cuda(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));
CHK_IF_RETURN(cudaFreeAsync(GpuA, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(GpuB, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(MulGpu, ntt_config.ctx.stream));
}
CHK_IF_RETURN(cudaEventRecord(stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&measured_time, start, stop));
if (print) { std::cout << measured_time / iterations << " MS" << std::endl; }
return CHK_LAST();
};
benchmark(false); // warmup
benchmark(true, 20);
bn254_release_domain(ntt_config.ctx);
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
return 0;
}

View File

@@ -1,3 +1,65 @@
#!/bin/bash
./build/example/example 1 # radix2
./build/example/example 0 # mixed-radix
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICILE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_BACKEND_DIR="${ICILE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_BACKEND_DIR}" ]; then
echo "Building icicle with CUDA backend"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle
BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend")
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -S "${ICILE_DIR}" -B build/icicle
fi
cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example
cmake --build build/icicle -j
cmake --build build/example -j
./build/example/example "$DEVICE_TYPE" "$BACKEND_INSTALL_DIR"

View File

@@ -1,15 +1,17 @@
#!/bin/bash
# #!/bin/bash
# Exit immediately on error
set -e
# TODO update for V3
mkdir -p build/example
mkdir -p build/icicle
# # Exit immediately on error
# set -e
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# mkdir -p build/example
# mkdir -p build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example
# # Configure and build Icicle
# cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
# cmake --build build/icicle
# # Configure and build the example application
# cmake -S . -B build/example
# cmake --build build/example

View File

@@ -4,6 +4,8 @@
#include "api/bn254.h"
#include "curves/params/bn254.cuh"
#include "poseidon/poseidon.cuh"
#include "hash/hash.cuh"
using namespace poseidon;
using namespace bn254;
@@ -14,13 +16,12 @@ inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level
// We assume the tree has leaves already set, compute all other levels
void build_tree(
const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t>* constants, PoseidonConfig config)
const uint32_t tree_height, scalar_t* tree, Poseidon<scalar_t> &poseidon, HashConfig &config)
{
for (uint32_t level = tree_height - 1; level > 0; level--) {
const uint32_t next_level = level - 1;
const uint32_t next_level_width = 1 << next_level;
bn254_poseidon_hash_cuda(
&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
poseidon.hash_many(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, 1, config);
}
}
@@ -65,8 +66,8 @@ uint32_t validate_proof(
const uint32_t tree_height,
const uint32_t* proof_lr,
const scalar_t* proof_hash,
PoseidonConstants<scalar_t>* constants,
PoseidonConfig config)
Poseidon<scalar_t> &poseidon,
HashConfig &config)
{
scalar_t hashes_in[2], hash_out[1], level_hash;
level_hash = hash;
@@ -79,7 +80,7 @@ uint32_t validate_proof(
hashes_in[1] = level_hash;
}
// next level hash
bn254_poseidon_hash_cuda(hashes_in, hash_out, 1, 2, *constants, config);
poseidon.hash_many(hashes_in, hash_out, 1, 2, 1, config);
level_hash = hash_out[0];
}
return proof_hash[0] == level_hash;
@@ -109,16 +110,15 @@ int main(int argc, char* argv[])
d = d + scalar_t::one();
}
std::cout << "Hashing blocks into tree leaves..." << std::endl;
PoseidonConstants<scalar_t> constants;
bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants);
PoseidonConfig config = default_poseidon_config(data_arity + 1);
bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config);
Poseidon<scalar_t> poseidon(data_arity, ctx);
HashConfig config = default_hash_config(ctx);
poseidon.hash_many(data, &tree[tree_index(leaf_level, 0)], tree_width, data_arity, 1, config);
std::cout << "3. Building Merkle tree" << std::endl;
PoseidonConstants<scalar_t> tree_constants;
bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants);
PoseidonConfig tree_config = default_poseidon_config(tree_arity + 1);
build_tree(tree_height, tree, &tree_constants, tree_config);
Poseidon<scalar_t> tree_poseidon(tree_arity, ctx);
HashConfig tree_config = default_hash_config(ctx);
build_tree(tree_height, tree, tree_poseidon, tree_config);
std::cout << "4. Generate membership proof" << std::endl;
uint32_t position = tree_width - 1;
@@ -133,13 +133,13 @@ int main(int argc, char* argv[])
std::cout << "5. Validate the hash membership" << std::endl;
uint32_t validated;
const scalar_t hash = tree[tree_index(leaf_level, query_position)];
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, tree_config);
std::cout << "Validated: " << validated << std::endl;
std::cout << "6. Tamper the hash" << std::endl;
const scalar_t tampered_hash = hash + scalar_t::one();
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, tree_config);
std::cout << "7. Invalidate tamper hash membership" << std::endl;
std::cout << "Validated: " << validated << std::endl;
return 0;

View File

@@ -1,2 +1,4 @@
#!/bin/bash
./build/example/example
# #!/bin/bash
# TODO update for V3
# ./build/example/example

View File

@@ -0,0 +1,16 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
project(example)
add_executable(example example.cpp)
target_include_directories(example PRIVATE "../../../icicle_v3/include" "..")
target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle")
message("${CMAKE_BINARY_DIR}/icicle")
target_link_libraries(example PRIVATE icicle_field_babybear icicle_device)
if(BACKEND_DIR)
add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}")
endif()

View File

@@ -0,0 +1,45 @@
# ICICLE example: RISC0's Fibonacci sequence proof using Polynomial API
## Why RISC0?
[RISC0 Protocol](https://www.risczero.com/) creates computational integrity proofs (a.k.a. Zero Knowledge Proofs) for programs executing on RISC-V architecture.
The proofs are created for sequences of values in RISC-V registers, called execution traces.
This approach is transparent to developers and enables the use of general purpose languages.
## Best-Practices
This example builds on the [ICICLE Polynomial API](../polynomial-api/README.md), so we recommend running it first.
## Key-Takeaway
RISC0 encodes execution traces into very large polynomials and commits them using Merkle trees.
FRI speeds up validation of such commitments by recursively generating smaller polynomials (and trees) from larger ones.
The key enabler for *recursion* is the *redundancy* of polynomial commitments, hence the use of Reed-Solomon codes.
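To make the recursion concrete, below is a sketch of a single FRI commit-phase fold expressed with the Polynomial API this example uses (`even()`, `odd()`, polynomial addition, scalar multiplication). The helper is illustrative only; in the example the fold is applied three times with fresh randomness per round (Lesson 11), and in a real protocol `r` comes from Fiat-Shamir rather than local randomness.
```cpp
// Sketch only: one FRI fold. Writing f(x) = f_even(x^2) + x * f_odd(x^2),
// the folded polynomial is f_next(y) = f_even(y) + r * f_odd(y), roughly halving the degree.
#include "icicle/polynomials/polynomials.h"
#include "icicle/api/babybear.h"
using namespace babybear;
typedef Polynomial<scalar_t> Polynomial_t;

Polynomial_t fri_fold(Polynomial_t& f, scalar_t r)
{
  auto f_even = f.even(); // polynomial built from the coefficients at even powers
  auto f_odd = f.odd();   // polynomial built from the coefficients at odd powers
  return f_even + r * f_odd;
}
```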
## Running the example
To run the example, from the project root directory:
```sh
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
## What's in the example
The example follows [STARK by Hand](https://dev.risczero.com/proof-system/stark-by-hand), structured in the following Lessons:
1. The Execution Trace
2. Rule checks to validate a computation
3. Padding the Trace
4. Constructing Trace Polynomials
5. ZK Commitments of the Trace Data
6. Constraint Polynomials
7. Mixing Constraint Polynomials
8. The Core of the RISC Zero STARK
9. The DEEP Technique
10. Mixing (Batching) for FRI
11. FRI Protocol (Commit Phase)
12. FRI Protocol (Query Phase)

View File

@@ -0,0 +1,293 @@
#include <iostream>
#include <memory>
#include <vector>
#include <list>
#include "examples_utils.h"
#include "icicle/polynomials/polynomials.h"
#include "icicle/api/babybear.h"
using namespace babybear;
// define the polynomial type
typedef Polynomial<scalar_t> Polynomial_t;
// RISC-V register type
typedef int64_t rv_t;
// Convert RISC-V registers to Finite Fields
void to_ff(rv_t* rv, scalar_t* s, size_t n)
{
for (int i = 0; i < n; ++i) {
s[i] = scalar_t::from(rv[i]);
}
}
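// print the evaluations of p on the 2^logn roots-of-unity domain, optionally shifted to a coset by `shift`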
void p_print(Polynomial_t* p, int logn, scalar_t shift, std::string header = "Print Vector")
{
std::cout << header << std::endl;
auto n = 1 << logn;
auto omega = scalar_t::omega(logn);
auto x = shift;
for (int i = 0; i < n; ++i) {
std::cout << i << ": " << (*p)(x) << std::endl;
x = x * omega;
}
}
// value to polynomial
Polynomial_t p_value(scalar_t value)
{
auto p_value = Polynomial_t::from_coefficients(&value, 1);
return p_value;
}
Polynomial_t p_rotate(Polynomial_t* p, int logn)
{
// rotate the polynomial's evaluations on the rou domain right by one position (i.e., shift the trace by one step)
auto n = 1 << logn;
auto evaluations_rou_domain = std::make_unique<scalar_t[]>(n);
p->evaluate_on_rou_domain(logn, evaluations_rou_domain.get());
scalar_t tmp = evaluations_rou_domain[n - 1];
for (int i = n - 1; i > 0; --i) {
evaluations_rou_domain[i] = evaluations_rou_domain[i - 1];
}
evaluations_rou_domain[0] = tmp;
return Polynomial_t::from_rou_evaluations(evaluations_rou_domain.get(), n);
}
// mix polynomials (c.f. mix polynomial evaluations)
Polynomial_t p_mix(Polynomial_t* in[], size_t nmix, scalar_t mix_parameter)
{
scalar_t factor = mix_parameter;
Polynomial_t out = in[0]->clone();
for (int i = 1; i < nmix; ++i) {
out += factor * (*in[i]);
factor = factor * mix_parameter;
}
return out;
}
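// fit a line y = coeffs[0] + coeffs[1] * x through the two points (xa, ya) and (xb, yb)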
void solve_linear(scalar_t xa, scalar_t ya, scalar_t xb, scalar_t yb, scalar_t* coeffs)
{
coeffs[1] = (ya - yb) * scalar_t::inverse(xa - xb);
coeffs[0] = ya - coeffs[1] * xa;
}
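// zero-pad p's coefficients to length n and evaluate them on a size-n NTT domain, optionally shifted to a coset by `shift`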
std::unique_ptr<scalar_t[]> InterpolateOnLargerDomain(Polynomial_t* p, int n, scalar_t shift = scalar_t::one())
{
const int deg = p->degree();
auto input = std::make_unique<scalar_t[]>(n);
// TBD: check if scalar_t constructor initializes to zero
for (int i = 0; i < n; ++i) {
input[i] = scalar_t::zero();
}
p->copy_coeffs(input.get(), 0 /*start*/, deg);
auto ntt_config = default_ntt_config<scalar_t>();
ntt_config.coset_gen = shift;
auto evals_h = std::make_unique<scalar_t[]>(n);
ICICLE_CHECK(ntt(input.get(), n, NTTDir::kForward, ntt_config, evals_h.get()));
return evals_h;
}
int main(int argc, char** argv)
{
try_load_and_set_backend_device(argc, argv);
START_TIMER(risc0_example);
std::cout << "This is an ICICLE C++ implementation of the STARK by Hand Explainer." << std::endl;
std::cout << "https://dev.risczero.com/proof-system/stark-by-hand" << std::endl;
const int logn = 3;
const int n = 1 << logn;
std::cout << "Initializing NTT" << std::endl;
static const int MAX_NTT_LOG_SIZE = 24;
auto ntt_config = default_ntt_config<scalar_t>();
const scalar_t basic_root = scalar_t::omega(MAX_NTT_LOG_SIZE);
ntt_init_domain(basic_root, default_ntt_init_domain_config());
std::cout << std::endl << "Lesson 1: The Execution Trace" << std::endl;
// Trace: Data Columns
rv_t rv_d1_trace[] = {24, 30, 54, 84, 78, 15, 29, 50};
rv_t rv_d2_trace[] = {30, 54, 84, 138, 2, 77, 21, 36};
rv_t rv_d3_trace[] = {54, 84, 138, 222, 71, 17, 92, 33};
auto d1_trace = std::make_unique<scalar_t[]>(n);
auto d2_trace = std::make_unique<scalar_t[]>(n);
auto d3_trace = std::make_unique<scalar_t[]>(n);
to_ff(rv_d1_trace, d1_trace.get(), n);
to_ff(rv_d2_trace, d2_trace.get(), n);
to_ff(rv_d3_trace, d3_trace.get(), n);
// Trace: Control Columns
// Init steps are flagged in c1_trace
// Computation steps are flagged in c2_trace
// Termination step is flagged in c3_trace
// 0s at the end of each control column correspond to the padding of the trace
rv_t rv_c1_trace[] = {1, 0, 0, 0, 0, 0, 0, 0};
rv_t rv_c2_trace[] = {0, 1, 1, 1, 0, 0, 0, 0};
rv_t rv_c3_trace[] = {0, 0, 0, 1, 0, 0, 0, 0};
auto c1_trace = std::make_unique<scalar_t[]>(n);
auto c2_trace = std::make_unique<scalar_t[]>(n);
auto c3_trace = std::make_unique<scalar_t[]>(n);
to_ff(rv_c1_trace, c1_trace.get(), n);
to_ff(rv_c2_trace, c2_trace.get(), n);
to_ff(rv_c3_trace, c3_trace.get(), n);
std::cout << "Lesson 2: Rule checks to validate a computation" << std::endl;
std::cout << "We use rule-checking polynomials." << std::endl;
std::cout << "Lesson 3: Padding the Trace" << std::endl;
// The trace is padded to a power of 2 size to allow for efficient NTT operations.
// we already did this in the initialization of the trace data
// We will construct a zero-knowledge proof that:
// this trace represents a program that satisfies these 6 rules:
// 1) Fibonacci words here
// 2) d1_trace[0] == 24 (init 1 constraint)
// 3) d2_trace[0] == 30 (init 2 constraint)
// 4) d3_trace[3] == 222 (termination constraint)
// 5) if c2_trace[i] == 1, then d2_trace[i] == d1_trace[i+1]
// 6) if c2_trace[i] == 1, then d3_trace[i] == d2_trace[i+1]
std::cout << "Lesson 4: Constructing Trace Polynomials" << std::endl;
auto p_d1 = Polynomial_t::from_rou_evaluations(d1_trace.get(), n);
auto p_d2 = Polynomial_t::from_rou_evaluations(d2_trace.get(), n);
auto p_d3 = Polynomial_t::from_rou_evaluations(d3_trace.get(), n);
auto p_c1 = Polynomial_t::from_rou_evaluations(c1_trace.get(), n);
auto p_c2 = Polynomial_t::from_rou_evaluations(c2_trace.get(), n);
auto p_c3 = Polynomial_t::from_rou_evaluations(c3_trace.get(), n);
std::cout << "Lesson 5: ZK Commitments of the Trace Data" << std::endl;
std::cout << "To maintain a zk protocol, the trace polynomials are evaluated over a zk commitment domain"
<< std::endl;
std::cout << "zk commitment domain is a coset of Reed Solomon domain shifted by a basic root of unity" << std::endl;
scalar_t xzk = basic_root;
p_print(&p_d1, logn, xzk, "ZK commitment for d1 polynomial");
std::cout << "Build Merkle Tree for ZK commitments (outside the scope of this example)" << std::endl;
std::cout << "Lesson 6: Constraint Polynomials" << std::endl;
std::cout << "The constraints are used to check the correctness of the trace. In this example, we check 6 rules to "
"establish the validity of the trace."
<< std::endl;
auto p_fib_constraint = (p_d3 - p_d2 - p_d1) * (p_c1 + p_c2 + p_c3);
auto fib_constraint_zkcommitment = InterpolateOnLargerDomain(&p_fib_constraint, 4 * n, xzk);
auto p_init1_constraint = (p_d1 - p_value(scalar_t::from(24))) * p_c1;
// sanity checks printing
p_print(
&p_init1_constraint, logn + 2, scalar_t::one(), "Reed-Solomon constraint polynomial gives 0s in every 4th row");
p_print(&p_init1_constraint, logn + 2, xzk, "ZK Commitment constraint polynomial gives no 0s");
auto p_init2_constraint = (p_d2 - p_value(scalar_t::from(30))) * p_c1;
auto p_termination_constraint = (p_d3 - p_value(scalar_t::from(222))) * p_c3;
auto p_recursion_constraint1 = (p_d1 - p_rotate(&p_d2, logn)) * p_c2;
auto p_recursion_constraint2 = (p_d2 - p_rotate(&p_d3, logn)) * p_c2;
std::cout << std::endl << "Lesson 7: Mixing Constraint Polynomials" << std::endl;
Polynomial_t* p_all_constraints[] = {&p_fib_constraint, &p_init1_constraint, &p_init2_constraint,
&p_termination_constraint, &p_recursion_constraint1, &p_recursion_constraint2};
const size_t nmix = sizeof(p_all_constraints) / sizeof(p_all_constraints[0]);
auto p_mixed_constraints = p_mix(p_all_constraints, nmix, scalar_t::from(5));
std::cout << "All constraint polynomials are low-degree:" << std::endl;
for (int i = 0; i < nmix; ++i) {
std::cout << i << ": " << p_all_constraints[i]->degree() << std::endl;
}
std::cout << "Lesson 8: The Core of the RISC Zero STARK" << std::endl;
std::cout << "Degree of the mixed constraints polynomial: " << p_mixed_constraints.degree() << std::endl;
auto p_validity = p_mixed_constraints.divide_by_vanishing_polynomial(n);
std::cout << "Degree of the validity polynomial: " << p_validity.degree() << std::endl;
std::cout << "The Verifier should provide the Merke commitment for the above" << std::endl;
std::cout << "Lesson 9: The DEEP Technique" << std::endl;
std::cout
<< "The DEEP technique improves the security of a single query by sampling outside of the commitment domain."
<< std::endl;
// In the original STARK protocol, the Verifier tests validity polynomial at a number of test points;
// the soundness of the protocol depends on the number of tests.
// The DEEP-ALI technique allows us to achieve a high degree of soundness with a single test.
// The details of DEEP are described in the following lesson.
auto DEEP_point = scalar_t::from(93);
std::cout << "The prover convinces the verifier that V=C/Z at the DEEP_test_point, " << DEEP_point << std::endl;
const scalar_t coeffs1[2] = {scalar_t::zero() - DEEP_point, scalar_t::one()};
auto denom_DEEP1 = Polynomial_t::from_coefficients(coeffs1, 2);
auto [p_d1_DEEP, r] = (p_d1 - p_value(DEEP_point)).divide(denom_DEEP1);
std::cout << "The DEEP d1 degree is: " << p_d1_DEEP.degree() << std::endl;
// d2, d3 use recursion constraints and need the point corresponding to the previous state (clock cycle)
auto omega = scalar_t::omega(logn);
auto DEEP_prev_point = DEEP_point * scalar_t::inverse(omega);
auto coeffs2 = std::make_unique<scalar_t[]>(2);
coeffs2[0] = scalar_t::zero() - DEEP_prev_point;
coeffs2[1] = scalar_t::one();
auto denom_DEEP2 = Polynomial_t::from_coefficients(coeffs2.get(), 2);
auto coeffs_d2bar = std::make_unique<scalar_t[]>(2);
solve_linear(DEEP_point, p_d2(DEEP_point), DEEP_prev_point, p_d2(DEEP_prev_point), coeffs_d2bar.get());
auto d2bar = Polynomial_t::from_coefficients(coeffs_d2bar.get(), 2);
auto [p_d2_DEEP, r2] = (p_d2 - d2bar).divide(denom_DEEP1 * denom_DEEP2);
std::cout << "The DEEP d2 degree is: " << p_d2_DEEP.degree() << std::endl;
auto coeffs_d3bar = std::make_unique<scalar_t[]>(2);
solve_linear(DEEP_point, p_d3(DEEP_point), DEEP_prev_point, p_d3(DEEP_prev_point), coeffs_d3bar.get());
auto d3bar = Polynomial_t::from_coefficients(coeffs_d3bar.get(), 2);
auto [p_d3_DEEP, r3] = (p_d3 - d3bar).divide(denom_DEEP1 * denom_DEEP2);
std::cout << "The DEEP d3 degree is: " << p_d3_DEEP.degree() << std::endl;
// DEEP c{1,2,3} polynomials
const scalar_t coeffs_c1bar[1] = {p_c1(DEEP_point)};
auto c1bar = Polynomial_t::from_coefficients(coeffs_c1bar, 1);
auto [p_c1_DEEP, r_c1] = (p_c1 - c1bar).divide(denom_DEEP1);
std::cout << "The DEEP c1 degree is: " << p_c1_DEEP.degree() << std::endl;
const scalar_t coeffs_c2bar[1] = {p_c2(DEEP_point)};
auto c2bar = Polynomial_t::from_coefficients(coeffs_c2bar, 1);
auto [p_c2_DEEP, r_c2] = (p_c2 - c2bar).divide(denom_DEEP1);
std::cout << "The DEEP c2 degree is: " << p_c2_DEEP.degree() << std::endl;
const scalar_t coeffs_c3bar[1] = {p_c3(DEEP_point)};
auto c3bar = Polynomial_t::from_coefficients(coeffs_c3bar, 1);
auto [p_c3_DEEP, r_c3] = (p_c3 - c3bar).divide(denom_DEEP1);
std::cout << "The DEEP c3 degree is: " << p_c3_DEEP.degree() << std::endl;
// DEEP validity polynomial
const scalar_t coeffs_vbar[1] = {p_validity(DEEP_point)};
auto vbar = Polynomial_t::from_coefficients(coeffs_vbar, 1);
auto [v_DEEP, r_v] = (p_validity - vbar).divide(denom_DEEP1);
std::cout << "The DEEP validity polynomial degree is: " << v_DEEP.degree() << std::endl;
std::cout << "The Prover sends DEEP polynomials to the Verifier" << std::endl;
std::cout << "Lesson 10: Mixing (Batching) for FRI" << std::endl;
std::cout << "The initial FRI polynomial is the mix of the 7 DEEP polynomials." << std::endl;
Polynomial_t* all_DEEP[] = {&p_d1_DEEP, &p_d2_DEEP, &p_d3_DEEP, &p_c1_DEEP, &p_c2_DEEP, &p_c3_DEEP, &v_DEEP};
Polynomial_t fri_input = p_mix(all_DEEP, 7, scalar_t::from(99));
std::cout << "The degree of the mixed DEEP polynomial is: " << fri_input.degree() << std::endl;
std::cout << "Lesson 11: FRI Protocol (Commit Phase)" << std::endl;
std::cout << "The prover provides information to convince the verifier that the DEEP polynomials are low-degree."
<< std::endl;
int nof_rounds = 3;
Polynomial_t feven[nof_rounds], fodd[nof_rounds], fri[nof_rounds + 1];
scalar_t rfri[nof_rounds];
fri[0] = fri_input.clone();
for (int i = 0; i < nof_rounds; ++i) {
feven[i] = fri[i].even();
fodd[i] = fri[i].odd();
rfri[i] = scalar_t::rand_host();
fri[i + 1] = feven[i] + rfri[i] * fodd[i];
std::cout << "The degree of the Round " << i << " polynomial is: " << fri[i + 1].degree() << std::endl;
}
std::cout << "Lesson 12: FRI Protocol (Query Phase)" << std::endl;
// We use Polynomial API to evaluate the FRI polynomials
// In practice, verifier will use Merkle commitments
auto xp = scalar_t::rand_host();
auto xm = scalar_t::zero() - xp;
scalar_t lhs[nof_rounds], rhs[nof_rounds];
for (int i = 0; i < nof_rounds; ++i) {
rhs[i] = (rfri[i] + xp) * fri[i](xp) * scalar_t::inverse(scalar_t::from(2) * xp) +
(rfri[i] + xm) * fri[i](xm) * scalar_t::inverse(scalar_t::from(2) * xm);
lhs[i] = fri[i + 1](xp * xp);
std::cout << "Round " << i << std::endl << "rhs: " << rhs[i] << std::endl << "lhs: " << lhs[i] << std::endl;
}
END_TIMER(risc0_example, "risc0 example");
return 0;
}

examples/c++/risc0/run.sh (new executable file, 65 lines)
View File

@@ -0,0 +1,65 @@
#!/bin/bash
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICILE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_BACKEND_DIR="${ICILE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_BACKEND_DIR}" ]; then
echo "Building icicle with CUDA backend"
cmake -DCMAKE_BUILD_TYPE=Release -DFIELD=babybear -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle
BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend")
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cmake -DCMAKE_BUILD_TYPE=Release -DFIELD=babybear -S "${ICILE_DIR}" -B build/icicle
fi
cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example
cmake --build build/icicle -j
cmake --build build/example -j
./build/example/example "$DEVICE_TYPE" "$BACKEND_INSTALL_DIR"

View File

@@ -4,15 +4,11 @@ version = "1.2.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core" }
icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254", features = ["g2"] }
icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-377" }
ark-bn254 = { version = "0.4.0", optional = true }
ark-bls12-377 = { version = "0.4.0", optional = true }
ark-ec = { version = "0.4.0", optional = true }
icicle-runtime = { path = "../../../wrappers/rust_v3/icicle-runtime" }
icicle-core = { path = "../../../wrappers/rust_v3/icicle-core" }
icicle-bn254 = { path = "../../../wrappers/rust_v3/icicle-curves/icicle-bn254", features = ["g2"] }
icicle-bls12-377 = { path = "../../../wrappers/rust_v3/icicle-curves/icicle-bls12-377" }
clap = { version = "<=4.4.12", features = ["derive"] }
[features]
arkworks = ["ark-bn254", "ark-bls12-377", "ark-ec", "icicle-core/arkworks", "icicle-bn254/arkworks", "icicle-bls12-377/arkworks"]
profile = []
cuda = ["icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", "icicle-bls12-377/cuda_backend"]

View File

@@ -2,10 +2,6 @@
`ICICLE` provides Rust bindings to CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Best Practices
To save time and avoid setting up prerequisites manually, we recommend running this example in our [ZKContainer](../../ZKContainer.md).
## Usage
```rust
@@ -28,29 +24,9 @@ In this example we use `BN254` curve. The function computes $result = \sum_{i=0}
Running the example:
```sh
cargo run --release
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
You can add the `--feature arkworks,profile` flag to measure times of both ICICLE and arkworks.
> [!NOTE]
> The default sizes are 2^19 - 2^23. You can change this by passing the `--lower_bound_log_size <size> --upper_bound_log_size <size>` options. To change the size range to 2^21 - 2^24, run the example like this:
> ```sh
> cargo run --release -- -l 21 -u 24
> ```
## Benchmarks
These benchmarks were run on a 16 core 24 thread i9-12900k CPU and an RTX 3090 Ti GPU
### Single BN254 MSM
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 10 ms | 11 ms | 21 ms | 39 ms | 77 ms |
| Arkworks | 284 ms | 540 ms | 1,152 ms | 2,320 ms | 4,491 ms |
### Single BLS12377 MSM
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 9 ms | 14 ms | 25 ms | 48 ms | 93 ms |
| Arkworks | 490 ms | 918 ms | 1,861 ms | 3,624 ms | 7,191 ms |

examples/rust/msm/run.sh (new executable file, 59 lines)
View File

@@ -0,0 +1,59 @@
#!/bin/bash
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICILE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_SOURCE_DIR="${ICILE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_SOURCE_DIR}" ]; then
echo "Building icicle with CUDA backend"
BACKEND_INSTALL_DIR="./target/release/deps/icicle/lib/backend"
cargo run --release --features=cuda -- --device-type "${DEVICE_TYPE}" --backend-install-dir "${BACKEND_INSTALL_DIR}"
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cargo run --release -- --device-type "${DEVICE_TYPE}" --backend-install-dir "${BACKEND_INSTALL_DIR}"
fi

View File

@@ -1,48 +1,60 @@
use icicle_bn254::curve::{CurveCfg, G1Projective, G2CurveCfg, G2Projective, ScalarCfg};
use icicle_runtime::{
memory::{DeviceVec, HostSlice},
stream::IcicleStream,
};
// Using both bn254 and bls12-377 curves
use icicle_bls12_377::curve::{
CurveCfg as BLS12377CurveCfg, G1Projective as BLS12377G1Projective, ScalarCfg as BLS12377ScalarCfg,
};
use icicle_cuda_runtime::{
memory::{DeviceVec, HostSlice},
stream::CudaStream,
};
use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
#[cfg(feature = "arkworks")]
use icicle_core::traits::ArkConvertible;
#[cfg(feature = "arkworks")]
use ark_bls12_377::{Fr as Bls12377Fr, G1Affine as Bls12377G1Affine, G1Projective as Bls12377ArkG1Projective};
#[cfg(feature = "arkworks")]
use ark_bn254::{Fr as Bn254Fr, G1Affine as Bn254G1Affine, G1Projective as Bn254ArkG1Projective};
#[cfg(feature = "arkworks")]
use ark_ec::scalar_mul::variable_base::VariableBaseMSM;
#[cfg(feature = "profile")]
use std::time::Instant;
use icicle_bn254::curve::{CurveCfg, G1Projective, G2CurveCfg, G2Projective, ScalarCfg};
use clap::Parser;
use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
#[derive(Parser, Debug)]
struct Args {
/// Lower bound (inclusive) of MSM sizes to run for
#[arg(short, long, default_value_t = 19)]
#[arg(short, long, default_value_t = 10)]
lower_bound_log_size: u8,
/// Upper bound of MSM sizes to run for
#[arg(short, long, default_value_t = 22)]
#[arg(short, long, default_value_t = 10)]
upper_bound_log_size: u8,
/// Device type (e.g., "CPU", "CUDA")
#[arg(short, long, default_value = "CPU")]
device_type: String,
/// Backend installation directory
#[arg(short, long, default_value = "")]
backend_install_dir: String,
}
// Load backend and set device
fn try_load_and_set_backend_device(args: &Args) {
if !args
.backend_install_dir
.is_empty()
{
println!("Trying to load backend from {}", &args.backend_install_dir);
icicle_runtime::runtime::load_backend(&args.backend_install_dir).unwrap();
}
println!("Setting device {}", args.device_type);
icicle_runtime::set_device(&icicle_runtime::Device::new(&args.device_type, 0)).unwrap();
}
fn main() {
let args = Args::parse();
println!("{:?}", args);
try_load_and_set_backend_device(&args);
let lower_bound = args.lower_bound_log_size;
let upper_bound = args.upper_bound_log_size;
println!("Running Icicle Examples: Rust MSM");
let upper_size = 1 << (upper_bound);
let upper_size = 1 << upper_bound;
println!("Generating random inputs on host for bn254...");
let upper_points = CurveCfg::generate_random_affine_points(upper_size);
let g2_upper_points = G2CurveCfg::generate_random_affine_points(upper_size);
@@ -56,59 +68,43 @@ fn main() {
let log_size = i;
let size = 1 << log_size;
println!(
"---------------------- MSM size 2^{}={} ------------------------",
"---------------------- MSM size 2^{} = {} ------------------------",
log_size, size
);
// Setting Bn254 points and scalars
let points = HostSlice::from_slice(&upper_points[..size]);
let g2_points = HostSlice::from_slice(&g2_upper_points[..size]);
let scalars = HostSlice::from_slice(&upper_scalars[..size]);
// Setting bls12377 points and scalars
// let points_bls12377 = &upper_points_bls12377[..size];
let points_bls12377 = HostSlice::from_slice(&upper_points_bls12377[..size]); // &upper_points_bls12377[..size];
let points_bls12377 = HostSlice::from_slice(&upper_points_bls12377[..size]);
let scalars_bls12377 = HostSlice::from_slice(&upper_scalars_bls12377[..size]);
println!("Configuring bn254 MSM...");
let mut msm_results = DeviceVec::<G1Projective>::cuda_malloc(1).unwrap();
let mut g2_msm_results = DeviceVec::<G2Projective>::cuda_malloc(1).unwrap();
let stream = CudaStream::create().unwrap();
let g2_stream = CudaStream::create().unwrap();
let mut msm_results = DeviceVec::<G1Projective>::device_malloc(1).unwrap();
let mut g2_msm_results = DeviceVec::<G2Projective>::device_malloc(1).unwrap();
let mut stream = IcicleStream::create().unwrap();
let mut g2_stream = IcicleStream::create().unwrap();
let mut cfg = msm::MSMConfig::default();
let mut g2_cfg = msm::MSMConfig::default();
cfg.ctx
.stream = &stream;
g2_cfg
.ctx
.stream = &g2_stream;
cfg.stream_handle = *stream;
cfg.is_async = true;
g2_cfg.stream_handle = *g2_stream;
g2_cfg.is_async = true;
println!("Configuring bls12377 MSM...");
let mut msm_results_bls12377 = DeviceVec::<BLS12377G1Projective>::cuda_malloc(1).unwrap();
let stream_bls12377 = CudaStream::create().unwrap();
let mut msm_results_bls12377 = DeviceVec::<BLS12377G1Projective>::device_malloc(1).unwrap();
let mut stream_bls12377 = IcicleStream::create().unwrap();
let mut cfg_bls12377 = msm::MSMConfig::default();
cfg_bls12377
.ctx
.stream = &stream_bls12377;
cfg_bls12377.stream_handle = *stream_bls12377;
cfg_bls12377.is_async = true;
println!("Executing bn254 MSM on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
msm::msm(scalars, points, &cfg, &mut msm_results[..]).unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BN254 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
msm::msm(scalars, g2_points, &g2_cfg, &mut g2_msm_results[..]).unwrap();
println!("Executing bls12377 MSM on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
msm::msm(
scalars_bls12377,
points_bls12377,
@@ -116,15 +112,8 @@ fn main() {
&mut msm_results_bls12377[..],
)
.unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BLS12377 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
println!("Moving results to host..");
println!("Moving results to host...");
let mut msm_host_result = vec![G1Projective::zero(); 1];
let mut g2_msm_host_result = vec![G2Projective::zero(); 1];
let mut msm_host_result_bls12377 = vec![BLS12377G1Projective::zero(); 1];
@@ -132,16 +121,17 @@ fn main() {
stream
.synchronize()
.unwrap();
g2_stream
.synchronize()
.unwrap();
msm_results
.copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result[..]))
.unwrap();
println!("bn254 result: {:#?}", msm_host_result);
g2_stream
.synchronize()
.unwrap();
g2_msm_results
.copy_to_host(HostSlice::from_mut_slice(&mut g2_msm_host_result[..]))
.unwrap();
println!("bn254 result: {:#?}", msm_host_result);
println!("G2 bn254 result: {:#?}", g2_msm_host_result);
stream_bls12377
@@ -152,69 +142,14 @@ fn main() {
.unwrap();
println!("bls12377 result: {:#?}", msm_host_result_bls12377);
#[cfg(feature = "arkworks")]
{
println!("Checking against arkworks...");
let ark_points: Vec<Bn254G1Affine> = points
.iter()
.map(|&point| point.to_ark())
.collect();
let ark_scalars: Vec<Bn254Fr> = scalars
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let ark_points_bls12377: Vec<Bls12377G1Affine> = points_bls12377
.iter()
.map(|point| point.to_ark())
.collect();
let ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
.iter()
.map(|scalar| scalar.to_ark())
.collect();
#[cfg(feature = "profile")]
let start = Instant::now();
let bn254_ark_msm_res = Bn254ArkG1Projective::msm(&ark_points, &ark_scalars).unwrap();
println!("Arkworks Bn254 result: {:#?}", bn254_ark_msm_res);
#[cfg(feature = "profile")]
println!(
"Ark BN254 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
#[cfg(feature = "profile")]
let start = Instant::now();
let bls12377_ark_msm_res =
Bls12377ArkG1Projective::msm(&ark_points_bls12377, &ark_scalars_bls12377).unwrap();
println!("Arkworks Bls12377 result: {:#?}", bls12377_ark_msm_res);
#[cfg(feature = "profile")]
println!(
"Ark BLS12377 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
let bn254_icicle_msm_res_as_ark = msm_host_result[0].to_ark();
let bls12377_icicle_msm_res_as_ark = msm_host_result_bls12377[0].to_ark();
println!(
"Bn254 MSM is correct: {}",
bn254_ark_msm_res.eq(&bn254_icicle_msm_res_as_ark)
);
println!(
"Bls12377 MSM is correct: {}",
bls12377_ark_msm_res.eq(&bls12377_icicle_msm_res_as_ark)
);
}
println!("Cleaning up bn254...");
stream
.destroy()
.unwrap();
g2_stream
.destroy()
.unwrap();
println!("Cleaning up bls12377...");
stream_bls12377
.destroy()


@@ -4,17 +4,12 @@ version = "1.2.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core", features = ["arkworks"] }
icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254", features = ["arkworks"] }
icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-377", features = ["arkworks"] }
icicle-runtime = { path = "../../../wrappers/rust_v3/icicle-runtime" }
icicle-core = { path = "../../../wrappers/rust_v3/icicle-core" }
icicle-bn254 = { path = "../../../wrappers/rust_v3/icicle-curves/icicle-bn254", features = ["g2"] }
icicle-bls12-377 = { path = "../../../wrappers/rust_v3/icicle-curves/icicle-bls12-377" }
ark-ff = { version = "0.4.0" }
ark-poly = "0.4.0"
ark-std = "0.4.0"
ark-bn254 = { version = "0.4.0" }
ark-bls12-377 = { version = "0.4.0" }
clap = { version = "<=4.4.12", features = ["derive"] }
[features]
profile = []
cuda = ["icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", "icicle-bls12-377/cuda_backend"]


@@ -4,10 +4,6 @@
`ICICLE` provides Rust bindings to CUDA-accelerated C++ implementation of [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md).
## Best Practices
To save time and avoid setting up prerequisites manually, we recommend running this example in our [ZKContainer](../../ZKContainer.md).
## Usage
```rust
@@ -32,34 +28,9 @@ In this example we use the `BN254` and `BLS12377` fields.
7. Compare results with arkworks
Running the example:
```sh
cargo run --release
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
You can add the `--features profile` flag to measure the run times of both ICICLE and arkworks.
> [!NOTE]
> The default size is 2^20. You can change this by passing the `--size <size>` option. To change the size to 2^23, run the example like this:
```sh
cargo run --release -- -s 23
```
## Benchmarks
These benchmarks were run on a 16-core, 24-thread i9-12900K CPU and an RTX 3090 Ti GPU.
### Single BN254 NTT
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 1.263 ms | 2.986 ms | 4.651 ms | 9.308 ms | 18.618 ms |
| Arkworks | 138 ms | 290 ms | 611 ms | 1,295 ms | 2,715 ms |
### Single BLS12377 NTT
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 1.272 ms | 2.893 ms | 4.728 ms | 9.211 ms | 18.319 ms |
| Arkworks | 135 ms | 286 ms | 605 ms | 1,279 ms | 2,682 ms |
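The ICICLE numbers come from the refactored example below, which drives the NTT through the V3 API. The following is a condensed sketch of the bn254 path, assembled from the diff; the forward-direction argument is an assumption, since the hunk elides the middle arguments of the `ntt::ntt` call.

```rust
use icicle_bn254::curve::{ScalarCfg as Bn254ScalarCfg, ScalarField as Bn254ScalarField};
use icicle_core::{
    ntt::{self, initialize_domain, NTTInitDomainConfig},
    traits::{FieldImpl, GenerateRandom},
};
use icicle_runtime::memory::{DeviceVec, HostSlice};

fn ntt_sketch(log_size: u8) {
    let size: usize = 1 << log_size;

    // Build the twiddle-factor domain once for the largest size in use.
    initialize_domain(
        ntt::get_root_of_unity::<Bn254ScalarField>(size as u64),
        &NTTInitDomainConfig::default(),
    )
    .unwrap();

    // Random input on the host; output buffer on the active device.
    let scalars = Bn254ScalarCfg::generate_random(size);
    let mut ntt_results = DeviceVec::<Bn254ScalarField>::device_malloc(size).unwrap();

    let cfg = ntt::NTTConfig::<Bn254ScalarField>::default();
    ntt::ntt(
        HostSlice::from_slice(&scalars),
        ntt::NTTDir::kForward, // assumed; the diff hunk omits this argument
        &cfg,
        &mut ntt_results[..],
    )
    .unwrap();

    // Bring the transformed values back to the host.
    let mut host_results = vec![Bn254ScalarField::zero(); size];
    ntt_results
        .copy_to_host(HostSlice::from_mut_slice(&mut host_results[..]))
        .unwrap();
}
```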

examples/rust/ntt/run.sh (new executable file, 59 lines)

@@ -0,0 +1,59 @@
#!/bin/bash
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICICLE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_SOURCE_DIR="${ICICLE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_SOURCE_DIR}" ]; then
echo "Building icicle with CUDA backend"
BACKEND_INSTALL_DIR="./target/release/deps/icicle/lib/backend"
cargo run --release --features=cuda -- --device-type "${DEVICE_TYPE}" --backend-install-dir "${BACKEND_INSTALL_DIR}"
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cargo run --release -- --device-type "${DEVICE_TYPE}" --backend-install-dir "${BACKEND_INSTALL_DIR}"
fi


@@ -1,93 +1,94 @@
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_bls12_377::curve::{ScalarCfg as BLS12377ScalarCfg, ScalarField as BLS12377ScalarField};
use icicle_bn254::curve::{ScalarCfg as Bn254ScalarCfg, ScalarField as Bn254ScalarField};
use icicle_runtime::memory::{DeviceVec, HostSlice};
use icicle_cuda_runtime::{
device_context::DeviceContext,
memory::{DeviceVec, HostSlice},
stream::CudaStream,
};
use clap::Parser;
use icicle_core::{
ntt::{self, initialize_domain},
traits::{FieldImpl, GenerateRandom},
};
use icicle_core::traits::ArkConvertible;
use ark_bls12_377::Fr as Bls12377Fr;
use ark_bn254::Fr as Bn254Fr;
use ark_ff::FftField;
use ark_poly::{EvaluationDomain, Radix2EvaluationDomain};
use ark_std::cmp::{Ord, Ordering};
use std::convert::TryInto;
#[cfg(feature = "profile")]
use std::time::Instant;
use clap::Parser;
#[derive(Parser, Debug)]
struct Args {
/// Size of NTT to run (20 for 2^20)
#[arg(short, long, default_value_t = 20)]
size: u8,
/// Device type (e.g., "CPU", "CUDA")
#[arg(short, long, default_value = "CPU")]
device_type: String,
/// Backend installation directory
#[arg(short, long, default_value = "")]
backend_install_dir: String,
}
// Load backend and set device
fn try_load_and_set_backend_device(args: &Args) {
if !args
.backend_install_dir
.is_empty()
{
println!("Trying to load backend from {}", &args.backend_install_dir);
icicle_runtime::runtime::load_backend(&args.backend_install_dir).unwrap();
}
println!("Setting device {}", args.device_type);
icicle_runtime::set_device(&icicle_runtime::Device::new(&args.device_type, 0)).unwrap();
}
fn main() {
let args = Args::parse();
println!("{:?}", args);
try_load_and_set_backend_device(&args);
println!("Running Icicle Examples: Rust NTT");
let log_size = args.size;
let size = 1 << log_size;
println!(
"---------------------- NTT size 2^{}={} ------------------------",
"---------------------- NTT size 2^{} = {} ------------------------",
log_size, size
);
// Setting bn254 scalars
println!("Generating random inputs on host for bn254...");
let scalars = ScalarCfg::generate_random(size);
let mut ntt_results = DeviceVec::<ScalarField>::cuda_malloc(size).unwrap();
let scalars = Bn254ScalarCfg::generate_random(size);
let mut ntt_results = DeviceVec::<Bn254ScalarField>::device_malloc(size).unwrap();
// Setting bls12377 scalars
println!("Generating random inputs on host for bls12377...");
let scalars_bls12377 = BLS12377ScalarCfg::generate_random(size);
let mut ntt_results_bls12377 = DeviceVec::<BLS12377ScalarField>::cuda_malloc(size).unwrap();
let mut ntt_results_bls12377 = DeviceVec::<BLS12377ScalarField>::device_malloc(size).unwrap();
println!("Setting up bn254 Domain...");
let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(
size.try_into()
.unwrap(),
initialize_domain(
ntt::get_root_of_unity::<Bn254ScalarField>(
size.try_into()
.unwrap(),
),
&ntt::NTTInitDomainConfig::default(),
)
.unwrap();
let ctx = DeviceContext::default();
initialize_domain(ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();
println!("Configuring bn254 NTT...");
let stream = CudaStream::create().unwrap();
let mut cfg = ntt::NTTConfig::<'_, ScalarField>::default();
cfg.ctx
.stream = &stream;
cfg.is_async = true;
let cfg = ntt::NTTConfig::<Bn254ScalarField>::default();
println!("Setting up bls12377 Domain...");
let icicle_omega = <Bls12377Fr as FftField>::get_root_of_unity(
size.try_into()
.unwrap(),
initialize_domain(
ntt::get_root_of_unity::<BLS12377ScalarField>(
size.try_into()
.unwrap(),
),
&ntt::NTTInitDomainConfig::default(),
)
.unwrap();
// reusing ctx from above
initialize_domain(BLS12377ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();
println!("Configuring bls12377 NTT...");
let stream_bls12377 = CudaStream::create().unwrap();
let mut cfg_bls12377 = ntt::NTTConfig::<'_, BLS12377ScalarField>::default();
cfg_bls12377
.ctx
.stream = &stream_bls12377;
cfg_bls12377.is_async = true;
let cfg_bls12377 = ntt::NTTConfig::<BLS12377ScalarField>::default();
println!("Executing bn254 NTT on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
ntt::ntt(
HostSlice::from_slice(&scalars),
@@ -96,7 +97,6 @@ fn main() {
&mut ntt_results[..],
)
.unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BN254 NTT on size 2^{log_size} took: {} μs",
start
@@ -105,7 +105,6 @@ fn main() {
);
println!("Executing bls12377 NTT on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
ntt::ntt(
HostSlice::from_slice(&scalars_bls12377),
@@ -114,7 +113,6 @@ fn main() {
&mut ntt_results_bls12377[..],
)
.unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BLS12377 NTT on size 2^{log_size} took: {} μs",
start
@@ -122,82 +120,14 @@ fn main() {
.as_micros()
);
println!("Moving results to host..");
stream
.synchronize()
.unwrap();
let mut host_bn254_results = vec![ScalarField::zero(); size];
println!("Moving results to host...");
let mut host_bn254_results = vec![Bn254ScalarField::zero(); size];
ntt_results
.copy_to_host(HostSlice::from_mut_slice(&mut host_bn254_results[..]))
.unwrap();
stream_bls12377
.synchronize()
.unwrap();
let mut host_bls12377_results = vec![BLS12377ScalarField::zero(); size];
ntt_results_bls12377
.copy_to_host(HostSlice::from_mut_slice(&mut host_bls12377_results[..]))
.unwrap();
println!("Checking against arkworks...");
let mut ark_scalars: Vec<Bn254Fr> = scalars
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let bn254_domain = <Radix2EvaluationDomain<Bn254Fr> as EvaluationDomain<Bn254Fr>>::new(size).unwrap();
let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let bls12_377_domain = <Radix2EvaluationDomain<Bls12377Fr> as EvaluationDomain<Bls12377Fr>>::new(size).unwrap();
#[cfg(feature = "profile")]
let start = Instant::now();
bn254_domain.fft_in_place(&mut ark_scalars);
#[cfg(feature = "profile")]
println!(
"Ark BN254 NTT on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
#[cfg(feature = "profile")]
let start = Instant::now();
bls12_377_domain.fft_in_place(&mut ark_scalars_bls12377);
#[cfg(feature = "profile")]
println!(
"Ark BLS12377 NTT on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
host_bn254_results
.iter()
.zip(ark_scalars.iter())
.for_each(|(icicle_scalar, &ark_scalar)| {
assert_eq!(ark_scalar.cmp(&icicle_scalar.to_ark()), Ordering::Equal);
});
println!("Bn254 NTT is correct");
host_bls12377_results
.iter()
.zip(ark_scalars_bls12377.iter())
.for_each(|(icicle_scalar, &ark_scalar)| {
assert_eq!(ark_scalar.cmp(&icicle_scalar.to_ark()), Ordering::Equal);
});
println!("Bls12377 NTT is correct");
println!("Cleaning up bn254...");
stream
.destroy()
.unwrap();
println!("Cleaning up bls12377...");
stream_bls12377
.destroy()
.unwrap();
println!("");
}


@@ -4,11 +4,13 @@ version = "1.2.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core" }
icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254" }
icicle-babybear = { path = "../../../wrappers/rust/icicle-fields/icicle-babybear" }
icicle-runtime = { path = "../../../wrappers/rust_v3/icicle-runtime" }
icicle-core = { path = "../../../wrappers/rust_v3/icicle-core" }
icicle-bn254 = { path = "../../../wrappers/rust_v3/icicle-curves/icicle-bn254", features = ["g2"] }
icicle-babybear = { path = "../../../wrappers/rust_v3/icicle-fields/icicle-babybear" }
clap = { version = "<=4.4.12", features = ["derive"] }
[features]
profile = []
cuda = ["icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", "icicle-babybear/cuda_backend"]


@@ -0,0 +1,15 @@
# ICICLE example: Polynomial API
## Key-Takeaway
`ICICLE` provides Rust bindings to the [Polynomial API](https://dev.ingonyama.com/icicle/rust-bindings/polynomials).
In this example we use the `BN254` and `babybear` fields to demonstrate how to compute on polynomials.
Running the example:
```sh
# for CPU
./run.sh -d CPU
# for CUDA
./run.sh -d CUDA -b /path/to/cuda/backend/install/dir
```
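For a rough picture of what the example computes, here is a sketch distilled from the refactored `main.rs` further down in this diff; the sizes are illustrative, and the NTT domain must be large enough to hold the product polynomial (hence the separate `max_ntt_log_size`).

```rust
use icicle_bn254::curve::{ScalarCfg as Bn254ScalarCfg, ScalarField as bn254Scalar};
use icicle_bn254::polynomials::DensePolynomial as PolynomialBn254;
use icicle_core::{
    ntt::{get_root_of_unity, initialize_domain, NTTInitDomainConfig},
    polynomials::UnivariatePolynomial,
    traits::{FieldImpl, GenerateRandom},
};
use icicle_runtime::memory::HostSlice;

fn poly_sketch(max_ntt_log_size: u8, poly_log_size: u8) {
    // Polynomial arithmetic relies on NTT, so the domain is initialized first.
    // max_ntt_log_size must exceed poly_log_size so products fit in the domain.
    let rou: bn254Scalar = get_root_of_unity(1u64 << max_ntt_log_size);
    initialize_domain(rou, &NTTInitDomainConfig::default()).unwrap();

    // Two random polynomials over the bn254 scalar field, built from coefficients.
    let size = 1 << poly_log_size;
    let coeffs_f = Bn254ScalarCfg::generate_random(size);
    let coeffs_g = Bn254ScalarCfg::generate_random(size / 2);
    let f = PolynomialBn254::from_coeffs(HostSlice::from_slice(&coeffs_f), size);
    let g = PolynomialBn254::from_coeffs(HostSlice::from_slice(&coeffs_g), size / 2);

    // Arithmetic, division and evaluation, mirroring the example.
    let sum = &f + &g;
    let prod = &f * &g;
    let (q, r) = prod.divide(&sum); // prod(x) = q(x) * sum(x) + r(x)
    let q_at_five = q.eval(&bn254Scalar::from_u32(5));
    println!("deg(r) = {}, q(5) = {:?}", r.degree(), q_at_five);
}
```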


@@ -0,0 +1,59 @@
#!/bin/bash
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage information
show_help() {
echo "Usage: $0 [-d DEVICE_TYPE] [-b BACKEND_INSTALL_DIR]"
echo
echo "Options:"
echo " -d DEVICE_TYPE Specify the device type (default: CPU)"
echo " -b BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)"
echo " -h Show this help message"
exit 0
}
# Parse command line options
while getopts ":d:b:h" opt; do
case ${opt} in
d )
DEVICE_TYPE=$OPTARG
;;
b )
BACKEND_INSTALL_DIR="$(realpath ${OPTARG})"
;;
h )
show_help
;;
\? )
echo "Invalid option: -$OPTARG" 1>&2
show_help
;;
: )
echo "Invalid option: -$OPTARG requires an argument" 1>&2
show_help
;;
esac
done
# Set default values if not provided
: "${DEVICE_TYPE:=CPU}"
: "${BACKEND_INSTALL_DIR:=}"
# Create necessary directories
mkdir -p build/example
mkdir -p build/icicle
ICICLE_DIR=$(realpath "../../../icicle_v3/")
ICICLE_CUDA_SOURCE_DIR="${ICICLE_DIR}/backend/cuda"
# Build Icicle and the example app that links to it
if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_SOURCE_DIR}" ]; then
echo "Building icicle with CUDA backend"
BACKEND_INSTALL_DIR="./target/release/deps/icicle/lib/backend"
cargo run --release --features=cuda -- --device-type "${DEVICE_TYPE}" --backend-install-dir "${BACKEND_INSTALL_DIR}"
else
echo "Building icicle without CUDA backend, BACKEND_INSTALL_DIR=${BACKEND_INSTALL_DIR}"
cargo run --release -- --device-type "${DEVICE_TYPE}" --backend-install-dir "${BACKEND_INSTALL_DIR}"
fi


@@ -3,21 +3,16 @@ use icicle_babybear::polynomials::DensePolynomial as PolynomialBabyBear;
use icicle_bn254::curve::ScalarField as bn254Scalar;
use icicle_bn254::polynomials::DensePolynomial as PolynomialBn254;
use icicle_cuda_runtime::{
device_context::DeviceContext,
memory::{DeviceVec, HostSlice},
};
use icicle_runtime::memory::{DeviceVec, HostSlice};
use icicle_core::{
ntt::{get_root_of_unity, initialize_domain},
ntt::{get_root_of_unity, initialize_domain, NTTInitDomainConfig},
polynomials::UnivariatePolynomial,
traits::{FieldImpl, GenerateRandom},
};
#[cfg(feature = "profile")]
use std::time::Instant;
use clap::Parser;
use std::time::Instant;
#[derive(Parser, Debug)]
struct Args {
@@ -26,21 +21,41 @@ struct Args {
max_ntt_log_size: u8,
#[arg(short, long, default_value_t = 15)]
poly_log_size: u8,
/// Device type (e.g., "CPU", "CUDA")
#[arg(short, long, default_value = "CPU")]
device_type: String,
/// Backend installation directory
#[arg(short, long, default_value = "/opt/icicle/backend")]
backend_install_dir: String,
}
fn init(max_ntt_size: u64) {
// initialize NTT domain for all fields!. Polynomials ops relies on NTT.
// Load backend and set device
fn try_load_and_set_backend_device(args: &Args) {
if !args
.backend_install_dir
.is_empty()
{
println!("Trying to load backend from {}", &args.backend_install_dir);
icicle_runtime::runtime::load_backend(&args.backend_install_dir).unwrap();
}
println!("Setting device {}", args.device_type);
let device = icicle_runtime::Device::new(&args.device_type, 0 /* =device_id*/);
icicle_runtime::set_device(&device).unwrap();
}
fn init_ntt_domain(max_ntt_size: u64) {
// Initialize NTT domain for all fields. Polynomial operations rely on NTT.
println!(
"Initializing NTT domain for max size 2^{}",
max_ntt_size.trailing_zeros()
);
let rou_bn254: bn254Scalar = get_root_of_unity(max_ntt_size);
let ctx = DeviceContext::default();
initialize_domain(rou_bn254, &ctx, false /*=fast twiddles mode*/).unwrap();
initialize_domain(rou_bn254, &NTTInitDomainConfig::default()).unwrap();
let rou_babybear: babybearScalar = get_root_of_unity(max_ntt_size);
initialize_domain(rou_babybear, &ctx, false /*=fast twiddles mode*/).unwrap();
// initialize the cuda backend for polynomials
// make sure to initialize it per field
PolynomialBn254::init_cuda_backend();
PolynomialBabyBear::init_cuda_backend();
initialize_domain(rou_babybear, &NTTInitDomainConfig::default()).unwrap();
}
fn randomize_poly<P>(size: usize, from_coeffs: bool) -> P
@@ -49,6 +64,7 @@ where
P::Field: FieldImpl,
P::FieldConfig: GenerateRandom<P::Field>,
{
println!("Randomizing polynomial of size {} (from_coeffs: {})", size, from_coeffs);
let coeffs_or_evals = P::FieldConfig::generate_random(size);
let p = if from_coeffs {
P::from_coeffs(HostSlice::from_slice(&coeffs_or_evals), size)
@@ -60,42 +76,61 @@ where
fn main() {
let args = Args::parse();
init(1 << args.max_ntt_log_size);
println!("{:?}", args);
try_load_and_set_backend_device(&args);
init_ntt_domain(1 << args.max_ntt_log_size);
// randomize three polynomials f,g,h over bn254 scalar field
let poly_size = 1 << args.poly_log_size;
println!("Randomizing polynomials [f(x),g(x),h(x)] over bn254 scalar field...");
let f = randomize_poly::<PolynomialBn254>(poly_size, true /*from random coeffs*/);
let g = randomize_poly::<PolynomialBn254>(poly_size / 2, true /*from random coeffs*/);
let h = randomize_poly::<PolynomialBn254>(poly_size / 4, false /*from random evaluations on rou*/);
// randomize two polynomials over babybear field
println!("Randomizing polynomials [f_babybear(x), g_babyber(x)] over babybear field...");
let f_babybear = randomize_poly::<PolynomialBabyBear>(poly_size, true /*from random coeffs*/);
let g_babybear = randomize_poly::<PolynomialBabyBear>(poly_size / 2, true /*from random coeffs*/);
let start = Instant::now();
// Arithmetic
println!("Computing t0(x) = f(x) + g(x)");
let t0 = &f + &g;
println!("Computing t1(x) f(x) * h(x)");
let t1 = &f * &h;
let (q, r) = t1.divide(&t0); // computes q,r for t1(x)=q(x)*t0(x)+r(x)
println!("Computing q(x),r(x) = t1(x)/t0(x) (where t1(x) = q(x) * t0(x) + r(x))");
let (q, r) = t1.divide(&t0);
println!("Computing f_babybear(x) * g_babybear(x)");
let _r_babybear = &f_babybear * &g_babybear;
// check degree
let _r_degree = r.degree();
// Check degree
println!("Degree of r(x): {}", r.degree());
// evaluate in single domain point
// Evaluate in single domain point
let five = bn254Scalar::from_u32(5);
println!("Evaluating q(5)");
let q_at_five = q.eval(&five);
// evaluate on domain. Note: domain and image can be either Host or Device slice.
// in this example domain in on host and evals on device.
// Evaluate on domain
let host_domain = [five, bn254Scalar::from_u32(30)];
let mut device_image = DeviceVec::<bn254Scalar>::cuda_malloc(host_domain.len()).unwrap();
t1.eval_on_domain(HostSlice::from_slice(&host_domain), &mut device_image[..]);
let mut device_image = DeviceVec::<bn254Scalar>::device_malloc(host_domain.len()).unwrap();
println!("Evaluating t1(x) on domain {:?}", host_domain);
t1.eval_on_domain(HostSlice::from_slice(&host_domain), &mut device_image[..]); // for NTT use eval_on_rou_domain()
// slicing
// Slicing
println!("Performing slicing operations on h");
let o = h.odd();
let e = h.even();
let fold = &e + &(&o * &q_at_five); // e(x) + o(x)*scalar
let fold = &e + &(&o * &q_at_five); // e(x) + o(x) * scalar
let _coeff = fold.get_coeff(2); // coeff of x^2
let _coeff = fold.get_coeff(2); // Coeff of x^2
println!(
"Polynomial computation on selected device took: {} ms",
start
.elapsed()
.as_millis()
);
}

examples/rust/poseidon/run.sh (new executable file, 3 lines)

@@ -0,0 +1,3 @@
#!/bin/bash
# TODO implement


@@ -2,7 +2,8 @@ use icicle_bls12_381::curve::ScalarField as F;
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
use icicle_core::hash::{SpongeHash, HashConfig};
use icicle_core::poseidon::Poseidon;
use icicle_core::traits::FieldImpl;
use icicle_cuda_runtime::memory::HostSlice;
@@ -24,14 +25,14 @@ fn main() {
let test_size = 1 << size;
println!("Running Icicle Examples: Rust Poseidon Hash");
let arity = 2u32;
let arity = 2;
println!(
"---------------------- Loading optimized Poseidon constants for arity={} ------------------------",
arity
);
let ctx = DeviceContext::default();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
let poseidon = Poseidon::load(arity, &ctx).unwrap();
let config = HashConfig::default();
println!(
"---------------------- Input size 2^{}={} ------------------------",
@@ -45,12 +46,12 @@ fn main() {
println!("Executing BLS12-381 Poseidon Hash on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
poseidon_hash_many::<F>(
poseidon.hash_many(
input_slice,
output_slice,
test_size as u32,
arity as u32,
&constants,
test_size,
arity,
1,
&config,
)
.unwrap();
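Pieced together, these hunks replace the free-function constants API with a `Poseidon` instance plus the generic `SpongeHash`/`HashConfig` interface. A rough sketch of the new call shape, with the argument order taken directly from the diff (the retained `DeviceContext`/`HostSlice` imports and the input preparation are assumptions, since those lines fall outside the hunks):

```rust
use icicle_bls12_381::curve::ScalarField as F;
use icicle_core::hash::{HashConfig, SpongeHash};
use icicle_core::poseidon::Poseidon;
use icicle_core::traits::FieldImpl;
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_cuda_runtime::memory::HostSlice;

fn poseidon_sketch() {
    let arity = 2;
    let test_size = 1 << 10;
    let ctx = DeviceContext::default();

    // Load the pre-generated Poseidon parameters for this arity.
    let poseidon = Poseidon::load(arity, &ctx).unwrap();
    let config = HashConfig::default();

    // test_size hashes, each absorbing `arity` field elements and emitting one.
    let inputs = vec![F::from_u32(1); (test_size * 2) as usize];
    let mut outputs = vec![F::zero(); test_size as usize];
    poseidon
        .hash_many(
            HostSlice::from_slice(&inputs),
            HostSlice::from_mut_slice(&mut outputs),
            test_size,
            arity, // input block length
            1,     // output length per hash
            &config,
        )
        .unwrap();
}
```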


@@ -1,5 +1,5 @@
function(check_field)
set(SUPPORTED_FIELDS babybear;stark252)
set(SUPPORTED_FIELDS babybear;stark252;m31)
set(IS_FIELD_SUPPORTED FALSE)
set(I 1000)


@@ -9,46 +9,67 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "fields/stark_fields/babybear.cuh"
#include "ntt/ntt.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
#include "poseidon2/poseidon2.cuh"
extern "C" cudaError_t babybear_extension_ntt_cuda(
const babybear::extension_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::extension_t* output);
extern "C" cudaError_t babybear_create_poseidon2_constants_cuda(
int width,
int alpha,
int internal_rounds,
int external_rounds,
extern "C" cudaError_t babybear_poseidon2_create_cuda(
poseidon2::Poseidon2<babybear::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const babybear::scalar_t* round_constants,
const babybear::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<babybear::scalar_t>* poseidon_constants);
device_context::DeviceContext& ctx
);
extern "C" cudaError_t babybear_init_poseidon2_constants_cuda(
int width,
extern "C" cudaError_t babybear_poseidon2_load_cuda(
poseidon2::Poseidon2<babybear::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<babybear::scalar_t>* poseidon_constants);
device_context::DeviceContext& ctx
);
extern "C" cudaError_t babybear_poseidon2_hash_cuda(
const babybear::scalar_t* input,
extern "C" cudaError_t babybear_poseidon2_hash_many_cuda(
const poseidon2::Poseidon2<babybear::scalar_t>* poseidon,
const babybear::scalar_t* inputs,
babybear::scalar_t* output,
int number_of_states,
int width,
const poseidon2::Poseidon2Constants<babybear::scalar_t>& constants,
poseidon2::Poseidon2Config& config);
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t babybear_release_poseidon2_constants_cuda(
poseidon2::Poseidon2Constants<babybear::scalar_t>* constants,
device_context::DeviceContext& ctx);
extern "C" cudaError_t
babybear_poseidon2_delete_cuda(poseidon2::Poseidon2<babybear::scalar_t>* poseidon, device_context::DeviceContext& ctx);
extern "C" cudaError_t babybear_build_merkle_tree(
const babybear::scalar_t* leaves,
babybear::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* compression,
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t babybear_mmcs_commit_cuda(
const matrix::Matrix<babybear::scalar_t>* leaves,
unsigned int number_of_inputs,
babybear::scalar_t* digests,
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* hasher,
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t babybear_mul_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
@@ -72,10 +93,8 @@ extern "C" cudaError_t babybear_transpose_matrix_cuda(
bool is_async);
extern "C" cudaError_t babybear_bit_reverse_cuda(
const babybear::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
babybear::scalar_t* output);
const babybear::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::scalar_t* output);
extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
@@ -101,6 +120,9 @@ extern "C" cudaError_t babybear_extension_mul_cuda(
extern "C" cudaError_t babybear_extension_add_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
extern "C" cudaError_t babybear_extension_accumulate_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t babybear_extension_sub_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
@@ -113,4 +135,8 @@ extern "C" cudaError_t babybear_extension_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t babybear_extension_bit_reverse_cuda(
const babybear::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::extension_t* output);
#endif


@@ -9,12 +9,13 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bls12_377.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
bls12_377::g2_affine_t* bases,
@@ -65,32 +66,52 @@ extern "C" cudaError_t bls12_377_affine_convert_montgomery(
extern "C" cudaError_t bls12_377_projective_convert_montgomery(
bls12_377::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bls12_377::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bls12_377::scalar_t>* poseidon_constants);
extern "C" cudaError_t bls12_377_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_377::scalar_t>* constants);
extern "C" cudaError_t bls12_377_poseidon_hash_cuda(
bls12_377::scalar_t* input,
bls12_377::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bls12_377_build_poseidon_merkle_tree(
extern "C" cudaError_t bls12_377_build_merkle_tree(
const bls12_377::scalar_t* leaves,
bls12_377::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_377_mmcs_commit_cuda(
const matrix::Matrix<bls12_377::scalar_t>* leaves,
unsigned int number_of_inputs,
bls12_377::scalar_t* digests,
const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* hasher,
const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_377_poseidon_create_cuda(
poseidon::Poseidon<bls12_377::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bls12_377::scalar_t* round_constants,
const bls12_377::scalar_t* mds_matrix,
const bls12_377::scalar_t* non_sparse_matrix,
const bls12_377::scalar_t* sparse_matrices,
const bls12_377::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_poseidon_load_cuda(
poseidon::Poseidon<bls12_377::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_poseidon_hash_many_cuda(
const poseidon::Poseidon<bls12_377::scalar_t>* poseidon,
const bls12_377::scalar_t* inputs,
bls12_377::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
bls12_377_poseidon_delete_cuda(poseidon::Poseidon<bls12_377::scalar_t>* poseidon);
extern "C" cudaError_t bls12_377_mul_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
@@ -114,10 +135,8 @@ extern "C" cudaError_t bls12_377_transpose_matrix_cuda(
bool is_async);
extern "C" cudaError_t bls12_377_bit_reverse_cuda(
const bls12_377::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
bls12_377::scalar_t* output);
const bls12_377::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_377::scalar_t* output);
extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);


@@ -9,12 +9,13 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bls12_381.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
bls12_381::g2_affine_t* bases,
@@ -65,32 +66,52 @@ extern "C" cudaError_t bls12_381_affine_convert_montgomery(
extern "C" cudaError_t bls12_381_projective_convert_montgomery(
bls12_381::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bls12_381::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bls12_381::scalar_t>* poseidon_constants);
extern "C" cudaError_t bls12_381_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_381::scalar_t>* constants);
extern "C" cudaError_t bls12_381_poseidon_hash_cuda(
bls12_381::scalar_t* input,
bls12_381::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bls12_381_build_poseidon_merkle_tree(
extern "C" cudaError_t bls12_381_build_merkle_tree(
const bls12_381::scalar_t* leaves,
bls12_381::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_381_mmcs_commit_cuda(
const matrix::Matrix<bls12_381::scalar_t>* leaves,
unsigned int number_of_inputs,
bls12_381::scalar_t* digests,
const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* hasher,
const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_381_poseidon_create_cuda(
poseidon::Poseidon<bls12_381::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bls12_381::scalar_t* round_constants,
const bls12_381::scalar_t* mds_matrix,
const bls12_381::scalar_t* non_sparse_matrix,
const bls12_381::scalar_t* sparse_matrices,
const bls12_381::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_poseidon_load_cuda(
poseidon::Poseidon<bls12_381::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_poseidon_hash_many_cuda(
const poseidon::Poseidon<bls12_381::scalar_t>* poseidon,
const bls12_381::scalar_t* inputs,
bls12_381::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
bls12_381_poseidon_delete_cuda(poseidon::Poseidon<bls12_381::scalar_t>* poseidon);
extern "C" cudaError_t bls12_381_mul_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
@@ -114,10 +135,8 @@ extern "C" cudaError_t bls12_381_transpose_matrix_cuda(
bool is_async);
extern "C" cudaError_t bls12_381_bit_reverse_cuda(
const bls12_381::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
bls12_381::scalar_t* output);
const bls12_381::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_381::scalar_t* output);
extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);


@@ -9,12 +9,13 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bn254.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
#include "poseidon2/poseidon2.cuh"
extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
@@ -66,63 +67,87 @@ extern "C" cudaError_t bn254_affine_convert_montgomery(
extern "C" cudaError_t bn254_projective_convert_montgomery(
bn254::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_create_poseidon2_constants_cuda(
int width,
int alpha,
int internal_rounds,
int external_rounds,
extern "C" cudaError_t bn254_poseidon2_create_cuda(
poseidon2::Poseidon2<bn254::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const bn254::scalar_t* round_constants,
const bn254::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<bn254::scalar_t>* poseidon_constants);
device_context::DeviceContext& ctx
);
extern "C" cudaError_t bn254_init_poseidon2_constants_cuda(
int width,
extern "C" cudaError_t bn254_poseidon2_load_cuda(
poseidon2::Poseidon2<bn254::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<bn254::scalar_t>* poseidon_constants);
device_context::DeviceContext& ctx
);
extern "C" cudaError_t bn254_poseidon2_hash_cuda(
const bn254::scalar_t* input,
extern "C" cudaError_t bn254_poseidon2_hash_many_cuda(
const poseidon2::Poseidon2<bn254::scalar_t>* poseidon,
const bn254::scalar_t* inputs,
bn254::scalar_t* output,
int number_of_states,
int width,
const poseidon2::Poseidon2Constants<bn254::scalar_t>& constants,
poseidon2::Poseidon2Config& config);
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t bn254_release_poseidon2_constants_cuda(
poseidon2::Poseidon2Constants<bn254::scalar_t>* constants,
device_context::DeviceContext& ctx);
extern "C" cudaError_t
bn254_poseidon2_delete_cuda(poseidon2::Poseidon2<bn254::scalar_t>* poseidon, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bn254::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bn254::scalar_t>* poseidon_constants);
extern "C" cudaError_t bn254_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bn254::scalar_t>* constants);
extern "C" cudaError_t bn254_poseidon_hash_cuda(
bn254::scalar_t* input,
bn254::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bn254::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bn254_build_poseidon_merkle_tree(
extern "C" cudaError_t bn254_build_merkle_tree(
const bn254::scalar_t* leaves,
bn254::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bn254::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* compression,
const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bn254_mmcs_commit_cuda(
const matrix::Matrix<bn254::scalar_t>* leaves,
unsigned int number_of_inputs,
bn254::scalar_t* digests,
const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* hasher,
const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bn254_poseidon_create_cuda(
poseidon::Poseidon<bn254::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bn254::scalar_t* round_constants,
const bn254::scalar_t* mds_matrix,
const bn254::scalar_t* non_sparse_matrix,
const bn254::scalar_t* sparse_matrices,
const bn254::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_poseidon_load_cuda(
poseidon::Poseidon<bn254::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_poseidon_hash_many_cuda(
const poseidon::Poseidon<bn254::scalar_t>* poseidon,
const bn254::scalar_t* inputs,
bn254::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
bn254_poseidon_delete_cuda(poseidon::Poseidon<bn254::scalar_t>* poseidon);
extern "C" cudaError_t bn254_mul_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
@@ -146,10 +171,8 @@ extern "C" cudaError_t bn254_transpose_matrix_cuda(
bool is_async);
extern "C" cudaError_t bn254_bit_reverse_cuda(
const bn254::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
bn254::scalar_t* output);
const bn254::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bn254::scalar_t* output);
extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);


@@ -9,12 +9,13 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bw6_761.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
bw6_761::g2_affine_t* bases,
@@ -65,32 +66,52 @@ extern "C" cudaError_t bw6_761_affine_convert_montgomery(
extern "C" cudaError_t bw6_761_projective_convert_montgomery(
bw6_761::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bw6_761::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bw6_761::scalar_t>* poseidon_constants);
extern "C" cudaError_t bw6_761_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bw6_761::scalar_t>* constants);
extern "C" cudaError_t bw6_761_poseidon_hash_cuda(
bw6_761::scalar_t* input,
bw6_761::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bw6_761_build_poseidon_merkle_tree(
extern "C" cudaError_t bw6_761_build_merkle_tree(
const bw6_761::scalar_t* leaves,
bw6_761::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bw6_761_mmcs_commit_cuda(
const matrix::Matrix<bw6_761::scalar_t>* leaves,
unsigned int number_of_inputs,
bw6_761::scalar_t* digests,
const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* hasher,
const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bw6_761_poseidon_create_cuda(
poseidon::Poseidon<bw6_761::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bw6_761::scalar_t* round_constants,
const bw6_761::scalar_t* mds_matrix,
const bw6_761::scalar_t* non_sparse_matrix,
const bw6_761::scalar_t* sparse_matrices,
const bw6_761::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_poseidon_load_cuda(
poseidon::Poseidon<bw6_761::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_poseidon_hash_many_cuda(
const poseidon::Poseidon<bw6_761::scalar_t>* poseidon,
const bw6_761::scalar_t* inputs,
bw6_761::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
bw6_761_poseidon_delete_cuda(poseidon::Poseidon<bw6_761::scalar_t>* poseidon);
extern "C" cudaError_t bw6_761_mul_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
@@ -114,10 +135,8 @@ extern "C" cudaError_t bw6_761_transpose_matrix_cuda(
bool is_async);
extern "C" cudaError_t bw6_761_bit_reverse_cuda(
const bw6_761::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
bw6_761::scalar_t* output);
const bw6_761::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bw6_761::scalar_t* output);
extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);


@@ -9,11 +9,12 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/grumpkin.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
grumpkin::affine_t* bases,
@@ -38,32 +39,52 @@ extern "C" cudaError_t grumpkin_affine_convert_montgomery(
extern "C" cudaError_t grumpkin_projective_convert_montgomery(
grumpkin::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t grumpkin_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const grumpkin::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<grumpkin::scalar_t>* poseidon_constants);
extern "C" cudaError_t grumpkin_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<grumpkin::scalar_t>* constants);
extern "C" cudaError_t grumpkin_poseidon_hash_cuda(
grumpkin::scalar_t* input,
grumpkin::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t grumpkin_build_poseidon_merkle_tree(
extern "C" cudaError_t grumpkin_build_merkle_tree(
const grumpkin::scalar_t* leaves,
grumpkin::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t grumpkin_mmcs_commit_cuda(
const matrix::Matrix<grumpkin::scalar_t>* leaves,
unsigned int number_of_inputs,
grumpkin::scalar_t* digests,
const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* hasher,
const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t grumpkin_poseidon_create_cuda(
poseidon::Poseidon<grumpkin::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const grumpkin::scalar_t* round_constants,
const grumpkin::scalar_t* mds_matrix,
const grumpkin::scalar_t* non_sparse_matrix,
const grumpkin::scalar_t* sparse_matrices,
const grumpkin::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t grumpkin_poseidon_load_cuda(
poseidon::Poseidon<grumpkin::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t grumpkin_poseidon_hash_many_cuda(
const poseidon::Poseidon<grumpkin::scalar_t>* poseidon,
const grumpkin::scalar_t* inputs,
grumpkin::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
grumpkin_poseidon_delete_cuda(poseidon::Poseidon<grumpkin::scalar_t>* poseidon);
extern "C" cudaError_t grumpkin_mul_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
@@ -87,10 +108,8 @@ extern "C" cudaError_t grumpkin_transpose_matrix_cuda(
bool is_async);
extern "C" cudaError_t grumpkin_bit_reverse_cuda(
const grumpkin::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
grumpkin::scalar_t* output);
const grumpkin::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, grumpkin::scalar_t* output);
extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);


@@ -6,11 +6,25 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "hash/keccak/keccak.cuh"
#include "merkle-tree/merkle.cuh"
extern "C" cudaError_t
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::HashConfig& config);
extern "C" cudaError_t
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::HashConfig& config);
extern "C" cudaError_t build_keccak256_merkle_tree_cuda(
const uint8_t* leaves,
uint64_t* digests,
unsigned int height,
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t build_keccak512_merkle_tree_cuda(
const uint8_t* leaves,
uint64_t* digests,
unsigned int height,
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config);
#endif

icicle/include/api/m31.h (new file, 94 lines)

@@ -0,0 +1,94 @@
// WARNING: This file is auto-generated by a script.
// Any changes made to this file may be overwritten.
// Please modify the code generation script instead.
// Path to the code generation script: scripts/gen_c_api.py
#pragma once
#ifndef M31_API_H
#define M31_API_H
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "fields/stark_fields/m31.cuh"
#include "vec_ops/vec_ops.cuh"
extern "C" cudaError_t m31_build_merkle_tree(
const m31::scalar_t* leaves,
m31::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<m31::scalar_t, m31::scalar_t>* compression,
const hash::Hasher<m31::scalar_t, m31::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t m31_mmcs_commit_cuda(
const matrix::Matrix<m31::scalar_t>* leaves,
unsigned int number_of_inputs,
m31::scalar_t* digests,
const hash::Hasher<m31::scalar_t, m31::scalar_t>* hasher,
const hash::Hasher<m31::scalar_t, m31::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t m31_mul_cuda(
m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result);
extern "C" cudaError_t m31_add_cuda(
m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result);
extern "C" cudaError_t m31_accumulate_cuda(
m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t m31_sub_cuda(
m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result);
extern "C" cudaError_t m31_transpose_matrix_cuda(
const m31::scalar_t* input,
uint32_t row_size,
uint32_t column_size,
m31::scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
extern "C" cudaError_t m31_bit_reverse_cuda(
const m31::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::scalar_t* output);
extern "C" void m31_generate_scalars(m31::scalar_t* scalars, int size);
extern "C" cudaError_t m31_scalar_convert_montgomery(
m31::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" void m31_extension_generate_scalars(m31::extension_t* scalars, int size);
extern "C" cudaError_t m31_extension_scalar_convert_montgomery(
m31::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t m31_extension_mul_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_add_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_accumulate_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t m31_extension_sub_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_transpose_matrix_cuda(
const m31::extension_t* input,
uint32_t row_size,
uint32_t column_size,
m31::extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
extern "C" cudaError_t m31_extension_bit_reverse_cuda(
const m31::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::extension_t* output);
#endif
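As a usage illustration for the vector API above, a hedged host-side sketch computing c = a·b element-wise and then accumulating a += b over the m31 field. It assumes the generated m31 API header above is included and that a default-constructed vec_ops::VecOpsConfig means host-resident, synchronous operation; both are assumptions of the sketch.

// Hedged sketch of the m31 vector-ops C API.
#include <vector>

void sketch_m31_vec_ops(int n)
{
  std::vector<m31::scalar_t> a(n), b(n), c(n);
  m31_generate_scalars(a.data(), n);                // fill with random field elements
  m31_generate_scalars(b.data(), n);

  vec_ops::VecOpsConfig config{};                   // assumed default-constructible
  cudaError_t err = m31_mul_cuda(a.data(), b.data(), n, config, c.data());
  if (err == cudaSuccess) err = m31_accumulate_cuda(a.data(), b.data(), n, config);
  (void)err;                                        // error handling elided in this sketch
}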


@@ -9,10 +9,29 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "fields/stark_fields/stark252.cuh"
#include "ntt/ntt.cuh"
#include "vec_ops/vec_ops.cuh"
extern "C" cudaError_t stark252_build_merkle_tree(
const stark252::scalar_t* leaves,
stark252::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* compression,
const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t stark252_mmcs_commit_cuda(
const matrix::Matrix<stark252::scalar_t>* leaves,
unsigned int number_of_inputs,
stark252::scalar_t* digests,
const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* hasher,
const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t stark252_mul_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
@@ -35,10 +54,8 @@ extern "C" cudaError_t stark252_transpose_matrix_cuda(
bool is_async);
extern "C" cudaError_t stark252_bit_reverse_cuda(
const stark252::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
stark252::scalar_t* output);
const stark252::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, stark252::scalar_t* output);
extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);


@@ -1,26 +1,29 @@
extern "C" cudaError_t ${FIELD}_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const ${FIELD}::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<${FIELD}::scalar_t>* poseidon_constants);
extern "C" cudaError_t ${FIELD}_poseidon_create_cuda(
poseidon::Poseidon<${FIELD}::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const ${FIELD}::scalar_t* round_constants,
const ${FIELD}::scalar_t* mds_matrix,
const ${FIELD}::scalar_t* non_sparse_matrix,
const ${FIELD}::scalar_t* sparse_matrices,
const ${FIELD}::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t ${FIELD}_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<${FIELD}::scalar_t>* constants);
extern "C" cudaError_t ${FIELD}_poseidon_load_cuda(
poseidon::Poseidon<${FIELD}::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t ${FIELD}_poseidon_hash_cuda(
${FIELD}::scalar_t* input,
extern "C" cudaError_t ${FIELD}_poseidon_hash_many_cuda(
const poseidon::Poseidon<${FIELD}::scalar_t>* poseidon,
const ${FIELD}::scalar_t* inputs,
${FIELD}::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<${FIELD}::scalar_t>& constants,
poseidon::PoseidonConfig& config);
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t ${FIELD}_build_poseidon_merkle_tree(
const ${FIELD}::scalar_t* leaves,
${FIELD}::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<${FIELD}::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
extern "C" cudaError_t
${FIELD}_poseidon_delete_cuda(poseidon::Poseidon<${FIELD}::scalar_t>* poseidon);
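The template above replaces the constants-centric flow (create/init constants plus a standalone poseidon_hash_cuda) with a handle-based lifecycle: load or create a poseidon::Poseidon object, hash batches through ${FIELD}_poseidon_hash_many_cuda, then delete the handle. Below is a hedged sketch of that lifecycle, written against an assumed concrete prefix `bn254` purely so the template reads as code; the default device-context helper and the default-constructed hash::HashConfig are likewise assumptions.

// Hedged sketch of the new Poseidon handle lifecycle (assumed field prefix "bn254").
void sketch_poseidon(const bn254::scalar_t* inputs, bn254::scalar_t* output, unsigned number_of_states)
{
  device_context::DeviceContext ctx = device_context::get_default_device_context(); // assumed helper
  poseidon::Poseidon<bn254::scalar_t>* poseidon = nullptr;

  // Load built-in constants for arity 2 and obtain a hasher handle.
  bn254_poseidon_load_cuda(&poseidon, /*arity=*/2, ctx);

  // Hash many states in one call; each state reads input_block_len elements and writes output_len.
  hash::HashConfig cfg{};                           // assumed default-constructible
  bn254_poseidon_hash_many_cuda(poseidon, inputs, output, number_of_states,
                                /*input_block_len=*/2, /*output_len=*/1, cfg);

  bn254_poseidon_delete_cuda(poseidon);
}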


@@ -1,30 +1,34 @@
extern "C" cudaError_t ${FIELD}_create_poseidon2_constants_cuda(
int width,
int alpha,
int internal_rounds,
int external_rounds,
extern "C" cudaError_t ${FIELD}_poseidon2_create_cuda(
poseidon2::Poseidon2<${FIELD}::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const ${FIELD}::scalar_t* round_constants,
const ${FIELD}::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* poseidon_constants);
device_context::DeviceContext& ctx
);
extern "C" cudaError_t ${FIELD}_init_poseidon2_constants_cuda(
int width,
extern "C" cudaError_t ${FIELD}_poseidon2_load_cuda(
poseidon2::Poseidon2<${FIELD}::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* poseidon_constants);
device_context::DeviceContext& ctx
);
extern "C" cudaError_t ${FIELD}_poseidon2_hash_cuda(
const ${FIELD}::scalar_t* input,
extern "C" cudaError_t ${FIELD}_poseidon2_hash_many_cuda(
const poseidon2::Poseidon2<${FIELD}::scalar_t>* poseidon,
const ${FIELD}::scalar_t* inputs,
${FIELD}::scalar_t* output,
int number_of_states,
int width,
const poseidon2::Poseidon2Constants<${FIELD}::scalar_t>& constants,
poseidon2::Poseidon2Config& config);
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t ${FIELD}_release_poseidon2_constants_cuda(
poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* constants,
device_context::DeviceContext& ctx);
extern "C" cudaError_t
${FIELD}_poseidon2_delete_cuda(poseidon2::Poseidon2<${FIELD}::scalar_t>* poseidon, device_context::DeviceContext& ctx);
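Poseidon2 follows the same handle pattern but is parameterized by width, rate, MDS type and diffusion strategy, and its delete call also takes the device context. A brief hedged sketch follows, again with an assumed `bn254` prefix; the enum member names DEFAULT_MDS and DEFAULT_DIFFUSION are assumptions rather than confirmed API.

// Hedged sketch of the Poseidon2 lifecycle; enum member names are assumed.
void sketch_poseidon2(const bn254::scalar_t* inputs, bn254::scalar_t* output, unsigned number_of_states)
{
  device_context::DeviceContext ctx = device_context::get_default_device_context();     // assumed helper
  poseidon2::Poseidon2<bn254::scalar_t>* poseidon = nullptr;

  bn254_poseidon2_load_cuda(&poseidon, /*width=*/16, /*rate=*/8,
                            poseidon2::MdsType::DEFAULT_MDS,                            // assumed member
                            poseidon2::DiffusionStrategy::DEFAULT_DIFFUSION,            // assumed member
                            ctx);

  hash::HashConfig cfg{};
  bn254_poseidon2_hash_many_cuda(poseidon, inputs, output, number_of_states,
                                 /*input_block_len=*/8, /*output_len=*/1, cfg);

  bn254_poseidon2_delete_cuda(poseidon, ctx);
}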


@@ -0,0 +1,16 @@
extern "C" cudaError_t ${FIELD}_build_merkle_tree(
const ${FIELD}::scalar_t* leaves,
${FIELD}::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t ${FIELD}_mmcs_commit_cuda(
const matrix::Matrix<${FIELD}::scalar_t>* leaves,
unsigned int number_of_inputs,
${FIELD}::scalar_t* digests,
const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* hasher,
const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
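These two declarations pair a bottom-layer hasher (leaves to the first digest layer) with a compression hasher (internal nodes). A hedged sketch of a tree build via ${FIELD}_build_merkle_tree with an assumed `bn254` prefix; where the two hash::Hasher objects come from (for example a previously created Poseidon handle) is deliberately left to the caller, and the interpretation of height and input_block_len below is an assumption.

// Hedged sketch: commit to a leaf array with the templated merkle builder.
void sketch_merkle(const bn254::scalar_t* leaves,
                   bn254::scalar_t* digests,
                   const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* compression,
                   const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* bottom_layer,
                   const merkle_tree::TreeBuilderConfig& tree_config)
{
  const unsigned height = 10;              // assumed: tree over 2^10 leaves
  const unsigned input_block_len = 1;      // assumed: one field element per leaf
  cudaError_t err =
    bn254_build_merkle_tree(leaves, digests, height, input_block_len, compression, bottom_layer, tree_config);
  (void)err;
}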


@@ -17,4 +17,7 @@ extern "C" cudaError_t ${FIELD}_transpose_matrix_cuda(
${FIELD}::scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
bool is_async);
extern "C" cudaError_t ${FIELD}_bit_reverse_cuda(
const ${FIELD}::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, ${FIELD}::scalar_t* output);


@@ -17,4 +17,7 @@ extern "C" cudaError_t ${FIELD}_extension_transpose_matrix_cuda(
${FIELD}::extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
bool is_async);
extern "C" cudaError_t ${FIELD}_extension_bit_reverse_cuda(
const ${FIELD}::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, ${FIELD}::extension_t* output);


@@ -22,7 +22,7 @@
typedef Affine<point_field_t> affine_t;
#define G2_CURVE_DEFINITIONS \
typedef ExtensionField<fq_config> g2_point_field_t; \
typedef ExtensionField<fq_config, point_field_t> g2_point_field_t; \
static constexpr g2_point_field_t g2_generator_x = \
g2_point_field_t{point_field_t{g2_gen_x_re}, point_field_t{g2_gen_x_im}}; \
static constexpr g2_point_field_t g2_generator_y = \


@@ -44,7 +44,7 @@ public:
static constexpr HOST_DEVICE_INLINE Field from(uint32_t value)
{
storage<TLC> scalar;
storage<TLC> scalar{};
scalar.limbs[0] = value;
for (int i = 1; i < TLC; i++) {
scalar.limbs[i] = 0;
@@ -58,8 +58,10 @@ public:
if (logn > CONFIG::omegas_count) { THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Field: Invalid omega index"); }
storage_array<CONFIG::omegas_count, TLC> const omega = CONFIG::omega;
return Field{omega.storages[logn - 1]};
Field omega = Field{CONFIG::rou};
for (int i = 0; i < CONFIG::omegas_count - logn; i++)
omega = sqr(omega);
return omega;
}
static HOST_INLINE Field omega_inv(uint32_t logn)
@@ -70,8 +72,10 @@ public:
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Field: Invalid omega_inv index");
}
storage_array<CONFIG::omegas_count, TLC> const omega_inv = CONFIG::omega_inv;
return Field{omega_inv.storages[logn - 1]};
Field omega = inverse(Field{CONFIG::rou});
for (int i = 0; i < CONFIG::omegas_count - logn; i++)
omega = sqr(omega);
return omega;
}
static HOST_DEVICE_INLINE Field inv_log_size(uint32_t logn)
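The omega/omega_inv change above drops the precomputed storage_array tables and derives the required root on the fly: CONFIG::rou is a root of unity of order \f$ 2^{omegas\_count} \f$, and each squaring halves that order, so after (omegas_count - logn) squarings the result has order \f$ 2^{logn} \f$, i.e. \f$ \omega_{logn} = rou^{2^{omegas\_count - logn}} \f$. For example, with omegas_count = 32 and logn = 20 the loop squares rou twelve times; omega_inv starts from inverse(rou) and squares the same number of times.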
@@ -182,7 +186,7 @@ public:
if (REDUCTION_SIZE == 0) return xs;
const ff_wide_storage modulus = get_modulus_squared<REDUCTION_SIZE>();
Wide rs = {};
return sub_limbs<true>(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
return sub_limbs<2 * TLC, true>(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
}
template <unsigned MODULUS_MULTIPLE = 1>
@@ -190,24 +194,24 @@ public:
{
const ff_wide_storage modulus = get_modulus_squared<MODULUS_MULTIPLE>();
Wide rs = {};
sub_limbs<false>(modulus, xs.limbs_storage, rs.limbs_storage);
sub_limbs<2 * TLC, false>(modulus, xs.limbs_storage, rs.limbs_storage);
return rs;
}
friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys)
{
Wide rs = {};
add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
add_limbs<2 * TLC, false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
return sub_modulus_squared<1>(rs);
}
friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys)
{
Wide rs = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
uint32_t carry = sub_limbs<2 * TLC, true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
if (carry == 0) return rs;
const ff_wide_storage modulus = get_modulus_squared<1>();
add_limbs<false>(rs.limbs_storage, modulus, rs.limbs_storage);
add_limbs<2 * TLC, false>(rs.limbs_storage, modulus, rs.limbs_storage);
return rs;
}
};
@@ -228,12 +232,6 @@ public:
}
}
template <unsigned MULTIPLIER = 1>
static constexpr HOST_DEVICE_INLINE ff_wide_storage modulus_wide()
{
return CONFIG::modulus_wide;
}
// return m
static constexpr HOST_DEVICE_INLINE ff_storage get_m() { return CONFIG::m; }
@@ -253,12 +251,11 @@ public:
}
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1))
template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r)
{
r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
for (unsigned i = 1; i < n; i++)
for (unsigned i = 1; i < NLIMBS; i++)
r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]);
if (!CARRY_OUT) {
ptx::addc(0, 0);
@@ -267,71 +264,35 @@ public:
return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0);
}
// add or subtract limbs
template <bool SUBTRACT, bool CARRY_OUT>
template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
add_sub_limbs_device(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
return add_sub_u32_device<SUBTRACT, CARRY_OUT>(x, y, r, TLC);
return add_sub_u32_device<NLIMBS, SUBTRACT, CARRY_OUT>(x, y, r);
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
add_sub_limbs_device(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
return add_sub_u32_device<SUBTRACT, CARRY_OUT>(x, y, r, 2 * TLC);
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
uint32_t carry = 0;
host_math::carry_chain<TLC, false, CARRY_OUT> chain;
for (unsigned i = 0; i < TLC; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint32_t
add_sub_limbs_host(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
uint32_t carry = 0;
host_math::carry_chain<2 * TLC, false, CARRY_OUT> chain;
for (unsigned i = 0; i < 2 * TLC; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
}
template <bool CARRY_OUT, typename T>
static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T& xs, const T& ys, T& rs)
template <unsigned NLIMBS, bool CARRY_OUT>
static constexpr HOST_DEVICE_INLINE uint32_t
add_limbs(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
{
#ifdef __CUDA_ARCH__
return add_sub_limbs_device<false, CARRY_OUT>(xs, ys, rs);
return add_sub_limbs_device<NLIMBS, false, CARRY_OUT>(xs, ys, rs);
#else
return add_sub_limbs_host<false, CARRY_OUT>(xs, ys, rs);
return host_math::template add_sub_limbs<NLIMBS, false, CARRY_OUT>(xs, ys, rs);
#endif
}
template <bool CARRY_OUT, typename T>
static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs(const T& xs, const T& ys, T& rs)
template <unsigned NLIMBS, bool CARRY_OUT>
static constexpr HOST_DEVICE_INLINE uint32_t
sub_limbs(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
{
#ifdef __CUDA_ARCH__
return add_sub_limbs_device<true, CARRY_OUT>(xs, ys, rs);
return add_sub_limbs_device<NLIMBS, true, CARRY_OUT>(xs, ys, rs);
#else
return add_sub_limbs_host<true, CARRY_OUT>(xs, ys, rs);
return host_math::template add_sub_limbs<NLIMBS, true, CARRY_OUT>(xs, ys, rs);
#endif
}
@@ -531,7 +492,7 @@ public:
// are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
// less efficient SASS generated by nvcc, so this case needed separate handling.
if (b[0] == UINT32_MAX) {
add_sub_u32_device<true, false>(c, a, even, TLC);
add_sub_u32_device<TLC, true, false>(c, a, even);
for (i = 0; i < TLC - 1; i++)
odd[i] = a[i];
} else {
@@ -639,17 +600,18 @@ public:
__align__(16) uint32_t diffs[TLC];
// Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \f$ are written into `diffs`, signs written to
// `carry1` and `carry2`.
uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
uint32_t carry2 = add_sub_u32_device<true, true>(b, &b[TLC >> 1], &diffs[TLC >> 1]);
uint32_t carry1 = add_sub_u32_device<(TLC >> 1), true, true>(&a[TLC >> 1], a, diffs);
uint32_t carry2 = add_sub_u32_device<(TLC >> 1), true, true>(b, &b[TLC >> 1], &diffs[TLC >> 1]);
// Compute the "middle part" of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} \f$.
// This is where the assumption about unset high bit of `a` and `b` is relevant.
multiply_and_add_short_raw_device(diffs, &diffs[TLC >> 1], middle_part, r, &r[TLC]);
// Corrections that need to be performed when differences are negative.
// Again, carry doesn't need to be propagated due to unset high bits of `a` and `b`.
if (carry1) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]);
if (carry2) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]);
if (carry1)
add_sub_u32_device<(TLC >> 1), true, false>(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]);
if (carry2) add_sub_u32_device<(TLC >> 1), true, false>(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]);
// Now that middle part is fully correct, it can be added to the result.
add_sub_u32_device<false, true>(&r[TLC >> 1], middle_part, &r[TLC >> 1], TLC);
add_sub_u32_device<TLC, false, true>(&r[TLC >> 1], middle_part, &r[TLC >> 1]);
// Carry from adding middle part has to be propagated to the highest limb.
for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
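For the subtractive Karatsuba step in this hunk, the identity being exploited (in the notation of the surrounding comments) is \f$ a_{lo} b_{hi} + a_{hi} b_{lo} = (a_{hi} - a_{lo})(b_{lo} - b_{hi}) + a_{lo} b_{lo} + a_{hi} b_{hi} \f$. That is why `diffs` holds the two half-differences, why multiply_and_add_short_raw_device folds their product together with the half-products already accumulated in `r` and `r[TLC]`, and why a correction is applied whenever one of the subtractions borrowed (carry1/carry2).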
@@ -673,25 +635,12 @@ public:
}
}
static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* r = rs.limbs;
for (unsigned i = 0; i < TLC; i++) {
uint32_t carry = 0;
for (unsigned j = 0; j < TLC; j++)
r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry);
r[TLC + i] = carry;
}
}
static HOST_DEVICE_INLINE void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_raw_device(as, bs, rs);
#else
return multiply_raw_host(as, bs, rs);
return host_math::template multiply_raw<TLC>(as, bs, rs);
#endif
}
@@ -702,9 +651,9 @@ public:
return multiply_and_add_lsb_neg_modulus_raw_device(as, cs, rs);
#else
Wide r_wide = {};
multiply_raw_host(as, get_neg_modulus(), r_wide.limbs_storage);
host_math::template multiply_raw<TLC>(as, get_neg_modulus(), r_wide.limbs_storage);
Field r = Wide::get_lower(r_wide);
add_limbs<false>(cs, r.limbs_storage, rs);
add_limbs<TLC, false>(cs, r.limbs_storage, rs);
#endif
}
@@ -713,7 +662,7 @@ public:
#ifdef __CUDA_ARCH__
return multiply_msb_raw_device(as, bs, rs);
#else
return multiply_raw_host(as, bs, rs);
return host_math::template multiply_raw<TLC>(as, bs, rs);
#endif
}
@@ -759,7 +708,7 @@ public:
if (REDUCTION_SIZE == 0) return xs;
const ff_storage modulus = get_modulus<REDUCTION_SIZE>();
Field rs = {};
return sub_limbs<true>(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
return sub_limbs<TLC, true>(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
}
friend std::ostream& operator<<(std::ostream& os, const Field& xs)
@@ -778,17 +727,17 @@ public:
friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys)
{
Field rs = {};
add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
add_limbs<TLC, false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
return sub_modulus<1>(rs);
}
friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys)
{
Field rs = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
uint32_t carry = sub_limbs<TLC, true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
if (carry == 0) return rs;
const ff_storage modulus = get_modulus<1>();
add_limbs<false>(rs.limbs_storage, modulus, rs.limbs_storage);
add_limbs<TLC, false>(rs.limbs_storage, modulus, rs.limbs_storage);
return rs;
}
@@ -838,15 +787,23 @@ public:
uint32_t carry;
// As mentioned, either 2 or 1 reduction can be performed depending on the field in question.
if (num_of_reductions() == 2) {
carry = sub_limbs<true>(r.limbs_storage, get_modulus<2>(), r_reduced);
carry = sub_limbs<TLC, true>(r.limbs_storage, get_modulus<2>(), r_reduced);
if (carry == 0) r = Field{r_reduced};
}
carry = sub_limbs<true>(r.limbs_storage, get_modulus<1>(), r_reduced);
carry = sub_limbs<TLC, true>(r.limbs_storage, get_modulus<1>(), r_reduced);
if (carry == 0) r = Field{r_reduced};
return r;
}
HOST_DEVICE_INLINE Field& operator=(Field const& other)
{
for (int i = 0; i < TLC; i++) {
this->limbs_storage.limbs[i] = other.limbs_storage.limbs[i];
}
return *this;
}
friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
{
Wide xy = mul_wide(xs, ys); // full mult
@@ -933,7 +890,7 @@ public:
{
const ff_storage modulus = get_modulus<MODULUS_MULTIPLE>();
Field rs = {};
sub_limbs<false>(modulus, xs.limbs_storage, rs.limbs_storage);
sub_limbs<TLC, false>(modulus, xs.limbs_storage, rs.limbs_storage);
return rs;
}
@@ -963,7 +920,7 @@ public:
static constexpr HOST_DEVICE_INLINE bool lt(const Field& xs, const Field& ys)
{
ff_storage dummy = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, dummy);
uint32_t carry = sub_limbs<TLC, true>(xs.limbs_storage, ys.limbs_storage, dummy);
return carry;
}
@@ -983,12 +940,12 @@ public:
while (!(u == one) && !(v == one)) {
while (is_even(u)) {
u = div2(u);
if (is_odd(b)) add_limbs<false>(b.limbs_storage, modulus, b.limbs_storage);
if (is_odd(b)) add_limbs<TLC, false>(b.limbs_storage, modulus, b.limbs_storage);
b = div2(b);
}
while (is_even(v)) {
v = div2(v);
if (is_odd(c)) add_limbs<false>(c.limbs_storage, modulus, c.limbs_storage);
if (is_odd(c)) add_limbs<TLC, false>(c.limbs_storage, modulus, c.limbs_storage);
c = div2(c);
}
if (lt(v, u)) {


@@ -33,6 +33,9 @@ namespace field_config = babybear;
#elif FIELD_ID == STARK_252
#include "fields/stark_fields/stark252.cuh"
namespace field_config = stark252;
#elif FIELD_ID == M31
#include "fields/stark_fields/m31.cuh"
namespace field_config = m31;
#endif
#endif
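The new branch wires the m31 stark field into the compile-time field selection: downstream code never names a concrete field, it only sees the field_config namespace alias. A minimal hedged sketch of a translation unit consuming this follows; the header name and the exact way the build injects FIELD_ID are assumptions here.

// Hedged sketch: generic code written against whichever field the build selected (e.g. FIELD_ID == M31).
#include "fields/field_config.cuh"   // assumed name of the dispatch header shown above

field_config::scalar_t double_element(const field_config::scalar_t& x)
{
  return x + x;                      // Field::operator+ is defined for every selectable field
}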

Some files were not shown because too many files have changed in this diff.