[FEAT]: Add codespell to CI and pre-commit hooks (#344)

Add codespell to pre-commit hook/CI and fix typos
This commit is contained in:
Jeremy Felder
2024-01-22 14:27:52 +02:00
committed by GitHub
parent 45f6db666b
commit 69af0bef91
21 changed files with 62 additions and 28 deletions

3
.codespellignore Normal file
View File

@@ -0,0 +1,3 @@
inout
crate
lmit

20
.github/workflows/codespell.yml vendored Normal file
View File

@@ -0,0 +1,20 @@
name: Check Spelling
on:
pull_request:
branches:
- main
- dev
jobs:
spelling-checker:
name: Check Spelling
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: codespell-project/actions-codespell@v2
with:
# https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip
skip: ./**/target,./**/build
# https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-ignore_words_file
ignore_words_file: .codespellignore

View File

@@ -105,6 +105,8 @@ In case `clang-format` is missing on your system, you can install it using the
sudo apt install clang-format sudo apt install clang-format
``` ```
You will also need to install [codespell](https://github.com/codespell-project/codespell?tab=readme-ov-file#installation) to check for typos.
This will ensure our custom hooks are run and will make it easier to follow our coding guidelines. This will ensure our custom hooks are run and will make it easier to follow our coding guidelines.
### Hall of Fame ### Hall of Fame

View File

@@ -43,7 +43,7 @@ poseidon.hash_blocks(inBlocks, nBlocks, outHashes, hashType, stream);
## What's in the example ## What's in the example
1. Define the size of the example: the hight of the full binary Merkle tree. 1. Define the size of the example: the height of the full binary Merkle tree.
2. Hash blocks in parallel. The tree width determines the number of blocks to hash. 2. Hash blocks in parallel. The tree width determines the number of blocks to hash.
3. Build a Merkle tree from the hashes. 3. Build a Merkle tree from the hashes.
4. Use the tree to generate a membership proof for one of the computed hashes. 4. Use the tree to generate a membership proof for one of the computed hashes.

View File

@@ -6,7 +6,7 @@ We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to s
## Key-Takeaway ## Key-Takeaway
`Icicle` accelerates multiplication operation `*` using [Karatsuba algorythm](https://en.wikipedia.org/wiki/Karatsuba_algorithm) `Icicle` accelerates multiplication operation `*` using [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm)
## Concise Usage Explanation ## Concise Usage Explanation

View File

@@ -41,7 +41,7 @@ type G1BaseField struct {
} }
/* /*
* BaseField Constrctors * BaseField Constructors
*/ */
func (f *G1BaseField) SetZero() *G1BaseField { func (f *G1BaseField) SetZero() *G1BaseField {

View File

@@ -41,7 +41,7 @@ type G1BaseField struct {
} }
/* /*
* BaseField Constrctors * BaseField Constructors
*/ */
func (f *G1BaseField) SetZero() *G1BaseField { func (f *G1BaseField) SetZero() *G1BaseField {

View File

@@ -41,7 +41,7 @@ type G1BaseField struct {
} }
/* /*
* BaseField Constrctors * BaseField Constructors
*/ */
func (f *G1BaseField) SetZero() *G1BaseField { func (f *G1BaseField) SetZero() *G1BaseField {

View File

@@ -41,7 +41,7 @@ type G1BaseField struct {
} }
/* /*
* BaseField Constrctors * BaseField Constructors
*/ */
func (f *G1BaseField) SetZero() *G1BaseField { func (f *G1BaseField) SetZero() *G1BaseField {

View File

@@ -23,7 +23,7 @@ type G1BaseField struct {
} }
/* /*
* BaseField Constrctors * BaseField Constructors
*/ */
func (f *G1BaseField) SetZero() *G1BaseField { func (f *G1BaseField) SetZero() *G1BaseField {

View File

@@ -328,7 +328,7 @@ namespace msm {
if (tid >= nof_msms) return; if (tid >= nof_msms) return;
P final_result = P::zero(); P final_result = P::zero();
// Note: in some cases accumulation of bm is implemented such that some bms are known to be empty. Therefore // Note: in some cases accumulation of bm is implemented such that some bms are known to be empty. Therefore
// skiping them. // skipping them.
for (unsigned i = nof_bms - nof_empty_bms; i > 1; i--) { for (unsigned i = nof_bms - nof_empty_bms; i > 1; i--) {
final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add
for (unsigned j = 0; j < c; j++) // double for (unsigned j = 0; j < c; j++) // double
@@ -348,7 +348,7 @@ namespace msm {
A* points, A* points,
unsigned batch_size, // number of MSMs to compute unsigned batch_size, // number of MSMs to compute
unsigned single_msm_size, // number of elements per MSM (a.k.a N) unsigned single_msm_size, // number of elements per MSM (a.k.a N)
unsigned nof_points, // numer of EC points in 'points' array. Must be either (1) single_msm_size if MSMs are unsigned nof_points, // number of EC points in 'points' array. Must be either (1) single_msm_size if MSMs are
// sharing points or (2) single_msm_size*batch_size otherwise // sharing points or (2) single_msm_size*batch_size otherwise
P* final_result, P* final_result,
bool are_scalars_on_device, bool are_scalars_on_device,
@@ -558,7 +558,7 @@ namespace msm {
CHK_IF_RETURN(cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream)); CHK_IF_RETURN(cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream));
CHK_IF_RETURN(cudaMemset(nof_large_buckets, 0, sizeof(unsigned))); CHK_IF_RETURN(cudaMemset(nof_large_buckets, 0, sizeof(unsigned)));
unsigned TOTAL_THREADS = 129000; // todo - device dependant unsigned TOTAL_THREADS = 129000; // todo - device dependent
unsigned cutoff_run_length = max(2, h_nof_buckets_to_compute / TOTAL_THREADS); unsigned cutoff_run_length = max(2, h_nof_buckets_to_compute / TOTAL_THREADS);
unsigned cutoff_nof_runs = (h_nof_buckets_to_compute + cutoff_run_length - 1) / cutoff_run_length; unsigned cutoff_nof_runs = (h_nof_buckets_to_compute + cutoff_run_length - 1) / cutoff_run_length;
NUM_THREADS = min(1 << 5, cutoff_nof_runs); NUM_THREADS = min(1 << 5, cutoff_nof_runs);
@@ -717,10 +717,10 @@ namespace msm {
} }
} }
if (target_bits_count == 1) { if (target_bits_count == 1) {
// Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guranteed to // Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guaranteed to
// be empty when target_windows_count>bitsize. // be empty when target_windows_count>bitsize.
// for example consider bitsize=253 and c=2. The reduction ends with 254 bms but the most significant one is // for example consider bitsize=253 and c=2. The reduction ends with 254 bms but the most significant one is
// guranteed to be zero since the scalars are 253b. // guaranteed to be zero since the scalars are 253b.
nof_bms_per_msm = target_windows_count; nof_bms_per_msm = target_windows_count;
nof_empty_bms_per_batch = target_windows_count - bitsize; nof_empty_bms_per_batch = target_windows_count - bitsize;
nof_bms_in_batch = nof_bms_per_msm * batch_size; nof_bms_in_batch = nof_bms_per_msm * batch_size;

View File

@@ -73,7 +73,7 @@ namespace msm {
bool is_big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity bool is_big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity
* but also greatly decreases parallelism, so only suitable for large batches of MSMs. * but also greatly decreases parallelism, so only suitable for large batches of MSMs.
* Default value: false. */ * Default value: false. */
bool is_async; /**< Whether to run the MSM asyncronously. If set to true, the MSM function will be bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be
* non-blocking and you'd need to synchronize it explicitly by running * non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM
* function will block the current CPU thread. */ * function will block the current CPU thread. */
@@ -103,7 +103,7 @@ namespace msm {
* *
* **Note:** this function is still WIP and the following [MSMConfig](@ref MSMConfig) members do not yet have any * **Note:** this function is still WIP and the following [MSMConfig](@ref MSMConfig) members do not yet have any
* effect: `precompute_factor` (always equals 1) and `ctx.device_id` (0 device is always used). * effect: `precompute_factor` (always equals 1) and `ctx.device_id` (0 device is always used).
* Also, it's currently better to use `batch_size=1` in most cases (expept with dealing with very many MSMs). * Also, it's currently better to use `batch_size=1` in most cases (except when dealing with very many MSMs).
*/ */
template <typename S, typename A, typename P> template <typename S, typename A, typename P>
cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig& config, P* results); cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig& config, P* results);

View File

@@ -260,7 +260,7 @@ namespace ntt {
* @param n_twiddles Size of `d_twiddles` * @param n_twiddles Size of `d_twiddles`
* @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`. * @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`.
* @param inverse true for iNTT * @param inverse true for iNTT
* @param coset should be array of lenght n or a nullptr if NTT is not computed on a coset * @param coset should be array of length n or a nullptr if NTT is not computed on a coset
* @param stream CUDA stream * @param stream CUDA stream
* @param is_async if false, perform sync of the supplied CUDA stream at the end of processing * @param is_async if false, perform sync of the supplied CUDA stream at the end of processing
* @param d_output Output array * @param d_output Output array

View File

@@ -76,7 +76,7 @@ namespace ntt {
* `Ordering::kNN`. */ * `Ordering::kNN`. */
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */ bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */ bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
bool is_async; /**< Whether to run the NTT asyncronously. If set to `true`, the NTT function will be bool is_async; /**< Whether to run the NTT asynchronously. If set to `true`, the NTT function will be
* non-blocking and you'd need to synchronize it explicitly by running * non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the NTT * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the NTT
* function will block the current CPU thread. */ * function will block the current CPU thread. */

View File

@@ -147,7 +147,7 @@ Poseidon<S>::hash_blocks(const S* inp, size_t blocks, S* out, HashType hash_type
CHK_IF_RETURN(cudaMallocAsync(&states, blocks * this->t * sizeof(S), stream)); CHK_IF_RETURN(cudaMallocAsync(&states, blocks * this->t * sizeof(S), stream));
// This is where the input matrix of size Arity x NumberOfBlocks is // This is where the input matrix of size Arity x NumberOfBlocks is
// padded and coppied to device in a T x NumberOfBlocks matrix // padded and copied to device in a T x NumberOfBlocks matrix
CHK_IF_RETURN(cudaMemcpy2DAsync( CHK_IF_RETURN(cudaMemcpy2DAsync(
states, this->t * sizeof(S), // Device pointer and device pitch states, this->t * sizeof(S), // Device pointer and device pitch
inp, (this->t - 1) * sizeof(S), // Host pointer and pitch inp, (this->t - 1) * sizeof(S), // Host pointer and pitch

View File

@@ -119,10 +119,10 @@ public:
return ExtensionField{real_prod + i_sq_times_im, re_im + im_re}; return ExtensionField{real_prod + i_sq_times_im, re_im + im_re};
} }
template <uint32_t mutliplier, unsigned REDUCTION_SIZE = 1> template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs) static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
{ {
return {FF::template mul_unsigned<mutliplier>(xs.real), FF::template mul_unsigned<mutliplier>(xs.imaginary)}; return {FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.imaginary)};
} }
template <unsigned MODULUS_MULTIPLE = 1> template <unsigned MODULUS_MULTIPLE = 1>

View File

@@ -8,7 +8,7 @@
* refactoring it is low in the priority list. * refactoring it is low in the priority list.
* *
* Documentation of methods is intended to explain inner workings to developers working on icicle. In its current state * Documentation of methods is intended to explain inner workings to developers working on icicle. In its current state
* it mostly explains modular mutliplication and related methods. One important quirk of modern CUDA that's affecting * it mostly explains modular multiplication and related methods. One important quirk of modern CUDA that's affecting
* most methods is explained by [Niall Emmart](https://youtu.be/KAWlySN7Hm8?si=h7nzDujnvubWXeDX&t=4039). In short, when * most methods is explained by [Niall Emmart](https://youtu.be/KAWlySN7Hm8?si=h7nzDujnvubWXeDX&t=4039). In short, when
* 64-bit MAD (`r = a * b + c`) instructions get compiled down to SASS (CUDA assembly) they require two-register values * 64-bit MAD (`r = a * b + c`) instructions get compiled down to SASS (CUDA assembly) they require two-register values
* `r` and `c` to start from even register (e.g. `r` can live in registers 20 and 21, or 14 and 15, but not 15 and 16). * `r` and `c` to start from even register (e.g. `r` can live in registers 20 and 21, or 14 and 15, but not 15 and 16).
@@ -490,7 +490,7 @@ public:
__align__(16) uint32_t odd[TLC - 1]; __align__(16) uint32_t odd[TLC - 1];
size_t i; size_t i;
// `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes // `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes
// are neccessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to // are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
// less efficient SASS generated by nvcc, so this case needed separate handling. // less efficient SASS generated by nvcc, so this case needed separate handling.
if (b[0] == UINT32_MAX) { if (b[0] == UINT32_MAX) {
add_sub_u32_device<true, false>(cs.limbs, a, even, TLC); add_sub_u32_device<true, false>(cs.limbs, a, even, TLC);
@@ -747,7 +747,7 @@ public:
* This method reduces a Wide number `xs` modulo `p` and returns the result as a Field element. * This method reduces a Wide number `xs` modulo `p` and returns the result as a Field element.
* *
* It is assumed that the high `2 * slack_bits` bits of `xs` are unset which is always the case for the product of 2 * It is assumed that the high `2 * slack_bits` bits of `xs` are unset which is always the case for the product of 2
* numbers with thier high `slack_bits` unset. Larger Wide numbers should be reduced by subtracting an appropriate * numbers with their high `slack_bits` unset. Larger Wide numbers should be reduced by subtracting an appropriate
* factor of `modulus_squared` first. * factor of `modulus_squared` first.
* *
* This function implements ["multi-precision Barrett"](https://github.com/ingonyama-zk/modular_multiplication). As * This function implements ["multi-precision Barrett"](https://github.com/ingonyama-zk/modular_multiplication). As
@@ -830,7 +830,7 @@ public:
return mul * xs; return mul * xs;
} }
template <uint32_t mutliplier, class T, unsigned REDUCTION_SIZE = 1> template <uint32_t multiplier, class T, unsigned REDUCTION_SIZE = 1>
static constexpr HOST_DEVICE_INLINE T mul_unsigned(const T& xs) static constexpr HOST_DEVICE_INLINE T mul_unsigned(const T& xs)
{ {
T rs = {}; T rs = {};
@@ -840,11 +840,11 @@ public:
#pragma unroll #pragma unroll
#endif #endif
for (unsigned i = 0; i < 32; i++) { for (unsigned i = 0; i < 32; i++) {
if (mutliplier & (1 << i)) { if (multiplier & (1 << i)) {
rs = is_zero ? temp : (rs + temp); rs = is_zero ? temp : (rs + temp);
is_zero = false; is_zero = false;
} }
if (mutliplier & ((1 << (31 - i) - 1) << (i + 1))) break; if (multiplier & ((1 << (31 - i) - 1) << (i + 1))) break;
temp = temp + temp; temp = temp + temp;
} }
return rs; return rs;

View File

@@ -16,7 +16,7 @@ namespace vec_ops {
* @param vec_b Second input vector. * @param vec_b Second input vector.
* @param n Size of vectors `vec_a` and `vec_b`. * @param n Size of vectors `vec_a` and `vec_b`.
* @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param is_on_device If true, inputs and outputs are on device, if false - on the host.
* @param is_montgomery If true, inputs are expected to be in Montgomery form and results are retured in Montgomery * @param is_montgomery If true, inputs are expected to be in Montgomery form and results are returned in Montgomery
* form. If false - inputs and outputs are non-Montgomery. * form. If false - inputs and outputs are non-Montgomery.
* @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method.
* @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`. * @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`.

View File

@@ -1,6 +1,15 @@
#!/bin/bash #!/bin/bash
status=0 status=0
if [[ $(codespell --skip ./**/target,./**/build -I .codespellignore 2>&1) ]];
then
echo "There are typos in some of the files you've changed. Please run the following to check what they are:"
echo "codespell --skip ./**/target,./**/build -I .codespellignore"
echo ""
status=1
fi
# Run clang-format on CUDA, C, and CPP files # Run clang-format on CUDA, C, and CPP files
# clang-format writes to stderr in dry-run mode. In order to capture the output to detect if there are changes needed we redirect stderr to stdin # clang-format writes to stderr in dry-run mode. In order to capture the output to detect if there are changes needed we redirect stderr to stdin
if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]];

View File

@@ -53,7 +53,7 @@ pub struct MSMConfig<'a> {
/// decreases parallelism, so only suitable for large batches of MSMs. Default value: false. /// decreases parallelism, so only suitable for large batches of MSMs. Default value: false.
pub is_big_triangle: bool, pub is_big_triangle: bool,
/// Whether to run the MSM asyncronously. If set to `true`, the MSM function will be non-blocking /// Whether to run the MSM asynchronously. If set to `true`, the MSM function will be non-blocking
/// and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`. /// and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`.
/// If set to `false`, the MSM function will block the current CPU thread. /// If set to `false`, the MSM function will block the current CPU thread.
pub is_async: bool, pub is_async: bool,

View File

@@ -54,7 +54,7 @@ pub struct NTTConfig<'a, S> {
pub ordering: Ordering, pub ordering: Ordering,
are_inputs_on_device: bool, are_inputs_on_device: bool,
are_outputs_on_device: bool, are_outputs_on_device: bool,
/// Whether to run the NTT asyncronously. If set to `true`, the NTT function will be non-blocking and you'd need to synchronize /// Whether to run the NTT asynchronously. If set to `true`, the NTT function will be non-blocking and you'd need to synchronize
/// it explicitly by running `stream.synchronize()`. If set to false, the NTT function will block the current CPU thread. /// it explicitly by running `stream.synchronize()`. If set to false, the NTT function will block the current CPU thread.
pub is_async: bool, pub is_async: bool,
} }