diff --git a/.codespellignore b/.codespellignore new file mode 100644 index 00000000..b41990df --- /dev/null +++ b/.codespellignore @@ -0,0 +1,3 @@ +inout +crate +lmit diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 00000000..90fc5932 --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,20 @@ +name: Check Spelling + +on: + pull_request: + branches: + - main + - dev + +jobs: + spelling-checker: + name: Check Spelling + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: codespell-project/actions-codespell@v2 + with: + # https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip + skip: ./**/target,./**/build + # https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-ignore_words_file + ignore_words_file: .codespellignore diff --git a/README.md b/README.md index c8f7e5c9..6a8f794c 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,8 @@ In case `clang-format` is missing on your system, you can install it using the sudo apt install clang-format ``` +You will also need to install [codespell](https://github.com/codespell-project/codespell?tab=readme-ov-file#installation) to check for typos. + This will ensure our custom hooks are run and will make it easier to follow our coding guidelines. ### Hall of Fame diff --git a/examples/c++/Poseidon-hash/README.md b/examples/c++/Poseidon-hash/README.md index b7f649b4..533a8db1 100644 --- a/examples/c++/Poseidon-hash/README.md +++ b/examples/c++/Poseidon-hash/README.md @@ -43,7 +43,7 @@ poseidon.hash_blocks(inBlocks, nBlocks, outHashes, hashType, stream); ## What's in the example -1. Define the size of the example: the hight of the full binary Merkle tree. +1. Define the size of the example: the height of the full binary Merkle tree. 2. Hash blocks in parallel. The tree width determines the number of blocks to hash. 3. Build a Merkle tree from the hashes. 4. Use the tree to generate a membership proof for one of computed hashes. 
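The Poseidon example README above ends with a four-step outline: size the tree, hash the leaf blocks in parallel, fold the hashes into a full binary Merkle tree, and produce a membership proof. As a reference for steps 3 and 4 only, here is a small self-contained C++ sketch; `hash_pair` is a deliberately fake stand-in for a real Poseidon 2-to-1 hash, and none of the names below are icicle API.

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

// Placeholder 2-to-1 hash standing in for Poseidon; illustration only.
uint64_t hash_pair(uint64_t l, uint64_t r) { return std::hash<uint64_t>{}(l * 31 + r); }

// Build a full binary Merkle tree, level by level, from already-hashed leaves.
// Assumes a power-of-two number of leaves; levels.back()[0] is the root.
std::vector<std::vector<uint64_t>> build_tree(std::vector<uint64_t> leaves) {
  std::vector<std::vector<uint64_t>> levels{leaves};
  while (levels.back().size() > 1) {
    const auto& prev = levels.back();
    std::vector<uint64_t> next(prev.size() / 2);
    for (size_t i = 0; i < next.size(); i++) next[i] = hash_pair(prev[2 * i], prev[2 * i + 1]);
    levels.push_back(next);
  }
  return levels;
}

// Membership proof for leaf `index`: the sibling hash at every level.
std::vector<uint64_t> prove(const std::vector<std::vector<uint64_t>>& levels, size_t index) {
  std::vector<uint64_t> path;
  for (size_t lvl = 0; lvl + 1 < levels.size(); lvl++) {
    path.push_back(levels[lvl][index ^ 1]); // sibling of the current node
    index /= 2;
  }
  return path;
}

// Recompute the root from a leaf hash and its sibling path, then compare.
bool verify(uint64_t leaf, size_t index, const std::vector<uint64_t>& path, uint64_t root) {
  for (uint64_t sibling : path) {
    leaf = (index % 2 == 0) ? hash_pair(leaf, sibling) : hash_pair(sibling, leaf);
    index /= 2;
  }
  return leaf == root;
}

int main() {
  std::vector<uint64_t> leaves = {11, 22, 33, 44, 55, 66, 77, 88}; // pretend Poseidon block hashes
  auto levels = build_tree(leaves);
  auto path = prove(levels, 5);
  printf("proof verifies: %d\n", verify(leaves[5], 5, path, levels.back()[0]));
  return 0;
}
```

The proof is simply the sibling hash at every level, so verifying a leaf re-hashes log2(width) pairs and compares the result against the stored root.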
diff --git a/examples/c++/multiply/README.md b/examples/c++/multiply/README.md index 56ae4552..da65b7e5 100644 --- a/examples/c++/multiply/README.md +++ b/examples/c++/multiply/README.md @@ -6,7 +6,7 @@ We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to s ## Key-Takeaway -`Icicle` accelerates multiplication operation `*` using [Karatsuba algorythm](https://en.wikipedia.org/wiki/Karatsuba_algorithm) +`Icicle` accelerates multiplication operation `*` using [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm) ## Concise Usage Explanation diff --git a/goicicle/curves/bls12377/g1.go b/goicicle/curves/bls12377/g1.go index b7c77eaf..a8dd5814 100644 --- a/goicicle/curves/bls12377/g1.go +++ b/goicicle/curves/bls12377/g1.go @@ -41,7 +41,7 @@ type G1BaseField struct { } /* - * BaseField Constrctors + * BaseField Constructors */ func (f *G1BaseField) SetZero() *G1BaseField { diff --git a/goicicle/curves/bls12381/g1.go b/goicicle/curves/bls12381/g1.go index dedd18a0..60a5a3ec 100644 --- a/goicicle/curves/bls12381/g1.go +++ b/goicicle/curves/bls12381/g1.go @@ -41,7 +41,7 @@ type G1BaseField struct { } /* - * BaseField Constrctors + * BaseField Constructors */ func (f *G1BaseField) SetZero() *G1BaseField { diff --git a/goicicle/curves/bn254/g1.go b/goicicle/curves/bn254/g1.go index b934d6ed..4e6cb14e 100644 --- a/goicicle/curves/bn254/g1.go +++ b/goicicle/curves/bn254/g1.go @@ -41,7 +41,7 @@ type G1BaseField struct { } /* - * BaseField Constrctors + * BaseField Constructors */ func (f *G1BaseField) SetZero() *G1BaseField { diff --git a/goicicle/curves/bw6761/g1.go b/goicicle/curves/bw6761/g1.go index 4b69ba05..acd764a0 100644 --- a/goicicle/curves/bw6761/g1.go +++ b/goicicle/curves/bw6761/g1.go @@ -41,7 +41,7 @@ type G1BaseField struct { } /* - * BaseField Constrctors + * BaseField Constructors */ func (f *G1BaseField) SetZero() *G1BaseField { diff --git a/goicicle/templates/curves/g1.go.tmpl b/goicicle/templates/curves/g1.go.tmpl index 595bd790..2edac159 100644 --- a/goicicle/templates/curves/g1.go.tmpl +++ b/goicicle/templates/curves/g1.go.tmpl @@ -23,7 +23,7 @@ type G1BaseField struct { } /* - * BaseField Constrctors + * BaseField Constructors */ func (f *G1BaseField) SetZero() *G1BaseField { diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 03c3963f..af61b7be 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -328,7 +328,7 @@ namespace msm { if (tid >= nof_msms) return; P final_result = P::zero(); // Note: in some cases accumulation of bm is implemented such that some bms are known to be empty. Therefore - // skiping them. + // skipping them. for (unsigned i = nof_bms - nof_empty_bms; i > 1; i--) { final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add for (unsigned j = 0; j < c; j++) // double @@ -348,7 +348,7 @@ namespace msm { A* points, unsigned batch_size, // number of MSMs to compute unsigned single_msm_size, // number of elements per MSM (a.k.a N) - unsigned nof_points, // numer of EC points in 'points' array. Must be either (1) single_msm_size if MSMs are + unsigned nof_points, // number of EC points in 'points' array. 
Must be either (1) single_msm_size if MSMs are // sharing points or (2) single_msm_size*batch_size otherwise P* final_result, bool are_scalars_on_device, @@ -558,7 +558,7 @@ namespace msm { CHK_IF_RETURN(cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream)); CHK_IF_RETURN(cudaMemset(nof_large_buckets, 0, sizeof(unsigned))); - unsigned TOTAL_THREADS = 129000; // todo - device dependant + unsigned TOTAL_THREADS = 129000; // todo - device dependent unsigned cutoff_run_length = max(2, h_nof_buckets_to_compute / TOTAL_THREADS); unsigned cutoff_nof_runs = (h_nof_buckets_to_compute + cutoff_run_length - 1) / cutoff_run_length; NUM_THREADS = min(1 << 5, cutoff_nof_runs); @@ -717,10 +717,10 @@ namespace msm { } } if (target_bits_count == 1) { - // Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guranteed to + // Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guaranteed to // be empty when target_windows_count>bitsize. // for example consider bitsize=253 and c=2. The reduction ends with 254 bms but the most significant one is - // guranteed to be zero since the scalars are 253b. + // guaranteed to be zero since the scalars are 253b. nof_bms_per_msm = target_windows_count; nof_empty_bms_per_batch = target_windows_count - bitsize; nof_bms_in_batch = nof_bms_per_msm * batch_size; diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index 26f17f5e..eecb374e 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -73,7 +73,7 @@ namespace msm { bool is_big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity * but also greatly decreases parallelism, so only suitable for large batches of MSMs. * Default value: false. */ - bool is_async; /**< Whether to run the MSM asyncronously. If set to true, the MSM function will be + bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be * non-blocking and you'd need to synchronize it explicitly by running * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM * function will block the current CPU thread. */ @@ -103,7 +103,7 @@ namespace msm { * * **Note:** this function is still WIP and the following [MSMConfig](@ref MSMConfig) members do not yet have any * effect: `precompute_factor` (always equals 1) and `ctx.device_id` (0 device is always used). - * Also, it's currently better to use `batch_size=1` in most cases (expept with dealing with very many MSMs). + * Also, it's currently better to use `batch_size=1` in most cases (except with dealing with very many MSMs). */ template <typename S, typename A, typename P> cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig& config, P* results); diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index 935a03f8..afe4d4cb 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -260,7 +260,7 @@ namespace ntt { * @param n_twiddles Size of `d_twiddles` * @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`.
* @param inverse true for iNTT - * @param coset should be array of lenght n or a nullptr if NTT is not computed on a coset + * @param coset should be array of length n or a nullptr if NTT is not computed on a coset * @param stream CUDA stream * @param is_async if false, perform sync of the supplied CUDA stream at the end of processing * @param d_output Output array diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index 13bd1024..10c70988 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -76,7 +76,7 @@ namespace ntt { * `Ordering::kNN`. */ bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */ bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */ - bool is_async; /**< Whether to run the NTT asyncronously. If set to `true`, the NTT function will be + bool is_async; /**< Whether to run the NTT asynchronously. If set to `true`, the NTT function will be * non-blocking and you'd need to synchronize it explicitly by running * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the NTT * function will block the current CPU thread. */ diff --git a/icicle/appUtils/poseidon/poseidon.cu b/icicle/appUtils/poseidon/poseidon.cu index bc852602..17a29038 100644 --- a/icicle/appUtils/poseidon/poseidon.cu +++ b/icicle/appUtils/poseidon/poseidon.cu @@ -147,7 +147,7 @@ Poseidon<S>::hash_blocks(const S* inp, size_t blocks, S* out, HashType hash_type CHK_IF_RETURN(cudaMallocAsync(&states, blocks * this->t * sizeof(S), stream)); // This is where the input matrix of size Arity x NumberOfBlocks is - // padded and coppied to device in a T x NumberOfBlocks matrix + // padded and copied to device in a T x NumberOfBlocks matrix CHK_IF_RETURN(cudaMemcpy2DAsync( states, this->t * sizeof(S), // Device pointer and device pitch inp, (this->t - 1) * sizeof(S), // Host pointer and pitch diff --git a/icicle/primitives/extension_field.cuh b/icicle/primitives/extension_field.cuh index 3189469a..8c57ea53 100644 --- a/icicle/primitives/extension_field.cuh +++ b/icicle/primitives/extension_field.cuh @@ -119,10 +119,10 @@ public: return ExtensionField{real_prod + i_sq_times_im, re_im + im_re}; } - template <unsigned mutliplier> + template <unsigned multiplier> static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs) { - return {FF::template mul_unsigned<mutliplier>(xs.real), FF::template mul_unsigned<mutliplier>(xs.imaginary)}; + return {FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.imaginary)}; } template diff --git a/icicle/primitives/field.cuh b/icicle/primitives/field.cuh index 2a0c3f19..4e89cd12 100644 --- a/icicle/primitives/field.cuh +++ b/icicle/primitives/field.cuh @@ -8,7 +8,7 @@ * refactoring it is low in the priority list. * * Documentation of methods is intended to explain inner workings to developers working on icicle. In its current state - * it mostly explains modular mutliplication and related methods. One important quirk of modern CUDA that's affecting + * it mostly explains modular multiplication and related methods. One important quirk of modern CUDA that's affecting * most methods is explained by [Niall Emmart](https://youtu.be/KAWlySN7Hm8?si=h7nzDujnvubWXeDX&t=4039). In short, when * 64-bit MAD (`r = a * b + c`) instructions get compiled down to SASS (CUDA assembly) they require two-register values * `r` and `c` to start from even register (e.g. `r` can live in registers 20 and 21, or 14 and 15, but not 15 and 16).
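The `is_async` comments fixed above (in `msm.cuh` and `ntt.cuh`, and again in the Rust wrappers below) all describe the same contract: with `is_async = true` the call only enqueues work on the given stream and returns immediately, so the caller must synchronize explicitly. A minimal host-side sketch of that pattern, using plain CUDA runtime calls (CUDA 11.2+ for the async allocator) as stand-ins for an asynchronous icicle MSM/NTT call; nothing here is icicle's API:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  int* d_buf = nullptr;
  int h_buf[4] = {0};
  cudaMallocAsync(&d_buf, sizeof(h_buf), stream);

  // These enqueue work on `stream` and return immediately, just like an
  // icicle MSM/NTT call with is_async = true would.
  cudaMemsetAsync(d_buf, 0xFF, sizeof(h_buf), stream);
  cudaMemcpyAsync(h_buf, d_buf, sizeof(h_buf), cudaMemcpyDeviceToHost, stream);

  // The host thread must not read h_buf yet; explicit synchronization
  // (cudaStreamSynchronize or cudaDeviceSynchronize) is required first.
  cudaStreamSynchronize(stream);
  printf("h_buf[0] = %d\n", h_buf[0]); // -1 after the 0xFF memset

  cudaFreeAsync(d_buf, stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}
```

With `is_async = false` the library performs the equivalent of this `cudaStreamSynchronize` internally before returning, blocking the calling CPU thread.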
@@ -490,7 +490,7 @@ public: __align__(16) uint32_t odd[TLC - 1]; size_t i; // `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes - // are neccessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to + // are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to // less efficient SASS generated by nvcc, so this case needed separate handling. if (b[0] == UINT32_MAX) { add_sub_u32_device(cs.limbs, a, even, TLC); @@ -747,7 +747,7 @@ public: * This method reduces a Wide number `xs` modulo `p` and returns the result as a Field element. * * It is assumed that the high `2 * slack_bits` bits of `xs` are unset which is always the case for the product of 2 - * numbers with thier high `slack_bits` unset. Larger Wide numbers should be reduced by subtracting an appropriate + * numbers with their high `slack_bits` unset. Larger Wide numbers should be reduced by subtracting an appropriate * factor of `modulus_squared` first. * * This function implements ["multi-precision Barrett"](https://github.com/ingonyama-zk/modular_multiplication). As @@ -830,7 +830,7 @@ public: return mul * xs; } - template <unsigned mutliplier> + template <unsigned multiplier> static constexpr HOST_DEVICE_INLINE T mul_unsigned(const T& xs) { T rs = {}; @@ -840,11 +840,11 @@ public: #pragma unroll #endif for (unsigned i = 0; i < 32; i++) { - if (mutliplier & (1 << i)) { + if (multiplier & (1 << i)) { rs = is_zero ? temp : (rs + temp); is_zero = false; } - if (mutliplier & ((1 << (31 - i) - 1) << (i + 1))) break; + if (multiplier & ((1 << (31 - i) - 1) << (i + 1))) break; temp = temp + temp; } return rs; } diff --git a/icicle/utils/vec_ops.cuh b/icicle/utils/vec_ops.cuh index 7ed1e9a1..3b7ce6ef 100644 --- a/icicle/utils/vec_ops.cuh +++ b/icicle/utils/vec_ops.cuh @@ -16,7 +16,7 @@ namespace vec_ops { * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. * @param is_on_device If true, inputs and outputs are on device, if false - on the host. - * @param is_montgomery If true, inputs are expected to be in Montgomery form and results are retured in Montgomery + * @param is_montgomery If true, inputs are expected to be in Montgomery form and results are returned in Montgomery * form. If false - inputs and outputs are non-Montgomery. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`. diff --git a/scripts/hooks/pre-push b/scripts/hooks/pre-push index d95160f2..6d9d265e 100755 --- a/scripts/hooks/pre-push +++ b/scripts/hooks/pre-push @@ -1,6 +1,15 @@ #!/bin/bash status=0 + +if [[ $(codespell --skip ./**/target,./**/build -I .codespellignore 2>&1) ]]; +then + echo "There are typos in some of the files you've changed. Please run the following to check what they are:" + echo "codespell --skip ./**/target,./**/build -I .codespellignore" + echo "" + status=1 +fi + # Run clang-format on CUDA, C, and CPP files # clang-format writes to stderr in dry-run mode.
In order to capture the output to detect if there are changes needed we redirect stderr to stdin if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; diff --git a/wrappers/rust/icicle-core/src/msm/mod.rs b/wrappers/rust/icicle-core/src/msm/mod.rs index b2d56fda..d23bd4a2 100644 --- a/wrappers/rust/icicle-core/src/msm/mod.rs +++ b/wrappers/rust/icicle-core/src/msm/mod.rs @@ -53,7 +53,7 @@ pub struct MSMConfig<'a> { /// decreases parallelism, so only suitable for large batches of MSMs. Default value: false. pub is_big_triangle: bool, - /// Whether to run the MSM asyncronously. If set to `true`, the MSM function will be non-blocking + /// Whether to run the MSM asynchronously. If set to `true`, the MSM function will be non-blocking /// and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`. /// If set to `false`, the MSM function will block the current CPU thread. pub is_async: bool, diff --git a/wrappers/rust/icicle-core/src/ntt/mod.rs b/wrappers/rust/icicle-core/src/ntt/mod.rs index e0fc30b0..577e2d6a 100644 --- a/wrappers/rust/icicle-core/src/ntt/mod.rs +++ b/wrappers/rust/icicle-core/src/ntt/mod.rs @@ -54,7 +54,7 @@ pub struct NTTConfig<'a, S> { pub ordering: Ordering, are_inputs_on_device: bool, are_outputs_on_device: bool, - /// Whether to run the NTT asyncronously. If set to `true`, the NTT function will be non-blocking and you'd need to synchronize + /// Whether to run the NTT asynchronously. If set to `true`, the NTT function will be non-blocking and you'd need to synchronize /// it explicitly by running `stream.synchronize()`. If set to false, the NTT function will block the current CPU thread. pub is_async: bool, }
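As background for the `mutliplier` renames in `icicle/primitives/field.cuh` and `extension_field.cuh` above: `mul_unsigned` multiplies a field element by a small compile-time constant using only additions, walking the bits of the constant while doubling a running term. The following standalone C++ sketch shows that shift-and-add idea; it is illustrative only, with a simplified early-exit test (and without icicle's `is_zero` shortcut) rather than the exact bit mask used in the library.

```cpp
#include <cstdint>
#include <cstdio>

// Toy stand-in for a field element that only supports addition, mirroring the
// constraint under which a constant multiplication must be built from adds.
// Names here are illustrative, not icicle's.
struct Elem {
  uint64_t v;
  Elem operator+(const Elem& o) const { return {v + o.v}; }
};

// Shift-and-add multiplication by a compile-time constant: walk the bits of
// `multiplier`, adding the running term wherever a bit is set and doubling it
// between iterations.
template <unsigned multiplier>
Elem mul_unsigned(Elem xs) {
  Elem rs{0};
  Elem temp = xs;
  for (unsigned i = 0; i < 32; i++) {
    if (multiplier & (1u << i)) rs = rs + temp;
    if ((multiplier >> i) <= 1) break; // no set bits above bit i, stop doubling
    temp = temp + temp;
  }
  return rs;
}

int main() {
  Elem x{7};
  printf("%llu\n", (unsigned long long)mul_unsigned<13>(x).v); // 7 * 13 = 91
  return 0;
}
```

For `multiplier = 13` (binary 1101) the loop doubles `temp` three times and adds it in at bits 0, 2 and 3, which is why multiplying by small constants stays cheap without invoking a full modular multiplication.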