Transpose kernel in vec_ops and Rust binding (#462)

## Describe the changes

This PR adds an `extern "C"` entry point for the transpose kernel, which now lives in vec_ops.cu.
It also adds a Rust binding and updates the `check_ntt_batch` test to use the new transpose function.
The test passes.
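
As a rough usage sketch (not part of the diff), the new Rust binding transposes a row-major matrix of field elements held in host memory. Only the `transpose_matrix` signature comes from this PR; the module paths and the `gpu_transpose` helper below are assumptions for illustration:

```rust
use icicle_core::traits::FieldImpl;
use icicle_core::vec_ops::{transpose_matrix, VecOps};
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;

/// Hypothetical helper: transposes `input`, laid out as `column_size` rows of
/// `row_size` elements each, on the GPU and returns the transposed data.
fn gpu_transpose<F>(input: Vec<F>, row_size: u32, column_size: u32, ctx: &DeviceContext) -> Vec<F>
where
    F: FieldImpl,
    <F as FieldImpl>::Config: VecOps<F>,
{
    let len = input.len();
    assert_eq!(len, (row_size * column_size) as usize);
    let input = HostOrDeviceSlice::on_host(input);
    let mut output = HostOrDeviceSlice::on_host(vec![F::zero(); len]);
    // on_device = false: data lives on the host, so the CUDA wrapper copies it to the
    // device and back; is_async = false: the stream is synchronized before returning.
    transpose_matrix(&input, row_size, column_size, &mut output, ctx, false, false).unwrap();
    output.as_slice().to_vec()
}
```

Any `DeviceContext` can be passed; the updated test reuses `config.ctx` from its NTT configuration.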

## Linked Issues

Resolves #

---------

Co-authored-by: LeonHibnik <leon@ingonyama.com>
Author: Vlad
Date: 2024-04-09 07:47:33 +02:00
Committed by: GitHub
Parent commit: 4c9b3c00a5
Commit: 4a35eece51
4 changed files with 179 additions and 6 deletions


@@ -33,6 +33,14 @@ namespace vec_ops {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; }
}
template <typename E>
__global__ void transpose_kernel(const E* in, E* out, uint32_t row_size, uint32_t column_size)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= row_size * column_size) return;
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
}
} // namespace
template <typename E, void (*Kernel)(E*, E*, int, E*)>
@@ -101,6 +109,48 @@ namespace vec_ops {
return VecOp<E, SubKernel>(vec_a, vec_b, n, config, result);
}
template <typename E>
cudaError_t transpose_matrix(
const E* mat_in,
E* mat_out,
uint32_t row_size,
uint32_t column_size,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
int number_of_threads = MAX_THREADS_PER_BLOCK;
int number_of_blocks = (row_size * column_size + number_of_threads - 1) / number_of_threads;
cudaStream_t stream = ctx.stream;
const E* d_mat_in;
E* d_allocated_input = nullptr;
E* d_mat_out;
if (!on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_input, row_size * column_size * sizeof(E), ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(
d_allocated_input, mat_in, row_size * column_size * sizeof(E), cudaMemcpyHostToDevice, ctx.stream));
CHK_IF_RETURN(cudaMallocAsync(&d_mat_out, row_size * column_size * sizeof(E), ctx.stream));
d_mat_in = d_allocated_input;
} else {
d_mat_in = mat_in;
d_mat_out = mat_out;
}
transpose_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(d_mat_in, d_mat_out, row_size, column_size);
if (!on_device) {
CHK_IF_RETURN(
cudaMemcpyAsync(mat_out, d_mat_out, row_size * column_size * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_mat_out, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_allocated_input, ctx.stream));
}
if (!is_async) return CHK_STICKY(cudaStreamSynchronize(ctx.stream));
return CHK_LAST();
}
/**
* Extern version of [Mul](@ref Mul) function with the template parameters
* `S` and `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build.
@@ -146,4 +196,21 @@ namespace vec_ops {
return Sub<curve_config::scalar_t>(vec_a, vec_b, n, config, result);
}
/**
* Extern version of [transpose_matrix](@ref transpose_matrix) function with the template parameter
* `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, TransposeMatrix)(
const curve_config::scalar_t* input,
uint32_t row_size,
uint32_t column_size,
curve_config::scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{
return transpose_matrix<curve_config::scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
}
} // namespace vec_ops
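
For reference (not part of the diff), `transpose_kernel` sends element `i` of the row-major input to position `(i % row_size) * column_size + i / row_size` of the output, so `row_size` is the length of each input row and `column_size` the length of each input column. A plain host-side sketch of the same mapping:

```rust
/// Host-side reference for the mapping applied by `transpose_kernel`: the input is
/// read as `column_size` rows of `row_size` elements, and the output is written as
/// `row_size` rows of `column_size` elements.
fn transpose_ref<T: Copy>(input: &[T], row_size: usize, column_size: usize) -> Vec<T> {
    assert_eq!(input.len(), row_size * column_size);
    let mut out = input.to_vec();
    for i in 0..input.len() {
        out[(i % row_size) * column_size + i / row_size] = input[i];
    }
    out
}

// Matches the 2x2 example documented in the header below:
// transpose_ref(&[a0, a1, a2, a3], 2, 2) == [a0, a2, a1, a3]
```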


@@ -95,6 +95,30 @@ namespace vec_ops {
*/
template <typename E>
cudaError_t Sub(E* vec_a, E* vec_b, int n, VecOpsConfig<E>& config, E* result);
/**
* Transposes an input matrix out-of-place on the GPU.
* For example: for ([a[0],a[1],a[2],a[3]], 2, 2) it returns
* [a[0],a[2],a[1],a[3]].
* @param mat_in Array of elements of type E of size row_size * column_size.
* @param mat_out Buffer of the same size as `mat_in` to write the transposed matrix into.
* @param row_size Length of each row of `mat_in` (i.e. its number of columns).
* @param column_size Length of each column of `mat_in` (i.e. its number of rows).
* @param ctx Device context.
* @param on_device Whether the input and output are on device.
* @param is_async Whether to run the vector operations asynchronously.
* @tparam E The type of elements of `mat_in` and `mat_out`.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename E>
cudaError_t transpose_matrix(
const E* mat_in,
E* mat_out,
uint32_t row_size,
uint32_t column_size,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
} // namespace vec_ops
#endif


@@ -9,6 +9,7 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
use crate::{
ntt::{initialize_domain, initialize_domain_fast_twiddles_mode, ntt, ntt_inplace, NTTDir, NttAlgorithm, Ordering},
traits::{ArkConvertible, FieldImpl, GenerateRandom},
vec_ops::{transpose_matrix, VecOps},
};
use super::{NTTConfig, NTT};
@@ -235,6 +236,7 @@ where
pub fn check_ntt_batch<F: FieldImpl>()
where
<F as FieldImpl>::Config: NTT<F> + GenerateRandom<F>,
<F as FieldImpl>::Config: VecOps<F>,
{
let test_sizes = [1 << 4, 1 << 12];
let batch_sizes = [1, 1 << 4, 100];
@@ -278,18 +280,38 @@ where
}
}
let row_size = test_size as u32;
let column_size = batch_size as u32;
let on_device = false;
let is_async = false;
// for now, columns batching only works with MixedRadix NTT
config.batch_size = batch_size as i32;
config.columns_batch = true;
let transposed_input =
HostOrDeviceSlice::on_host(transpose_flattened_matrix(&scalars[..], batch_size));
let mut transposed_input = HostOrDeviceSlice::on_host(vec![F::zero(); batch_size * test_size]);
transpose_matrix(
&scalars,
row_size,
column_size,
&mut transposed_input,
&config.ctx,
on_device,
is_async,
)
.unwrap();
let mut col_batch_ntt_result =
HostOrDeviceSlice::on_host(vec![F::zero(); batch_size * test_size]);
ntt(&transposed_input, is_inverse, &config, &mut col_batch_ntt_result).unwrap();
assert_eq!(
batch_ntt_result[..],
transpose_flattened_matrix(&col_batch_ntt_result[..], test_size)
);
transpose_matrix(
&col_batch_ntt_result,
column_size,
row_size,
&mut transposed_input,
&config.ctx,
on_device,
is_async,
)
.unwrap();
assert_eq!(batch_ntt_result[..], *transposed_input.as_slice());
config.columns_batch = false;
}
}
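
Both `transpose_matrix` calls above pass `row_size` as the row length of their own input, which is why the second call swaps the two sizes; transposing once and then again with the dimensions swapped restores the original row-batched layout. A quick host-side check of that inverse property, reusing the hypothetical `transpose_ref` sketch from the vec_ops.cu section (illustrative only):

```rust
fn transpose_round_trip_check() {
    // e.g. test_size = 4 (row length) and batch_size = 3 (column length)
    let (row_size, column_size) = (4usize, 3usize);
    let input: Vec<u32> = (0..(row_size * column_size) as u32).collect();
    let there = transpose_ref(&input, row_size, column_size);
    let back = transpose_ref(&there, column_size, row_size);
    assert_eq!(back, input); // the two transposes are mutually inverse
}
```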


@@ -63,6 +63,16 @@ pub trait VecOps<F> {
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
fn transpose(
input: &HostOrDeviceSlice<F>,
row_size: u32,
column_size: u32,
output: &mut HostOrDeviceSlice<F>,
ctx: &DeviceContext,
on_device: bool,
is_async: bool,
) -> IcicleResult<()>;
}
fn check_vec_ops_args<F>(a: &HostOrDeviceSlice<F>, b: &HostOrDeviceSlice<F>, result: &mut HostOrDeviceSlice<F>) {
@@ -118,6 +128,22 @@ where
<<F as FieldImpl>::Config as VecOps<F>>::mul(a, b, result, cfg)
}
pub fn transpose_matrix<F>(
input: &HostOrDeviceSlice<F>,
row_size: u32,
column_size: u32,
output: &mut HostOrDeviceSlice<F>,
ctx: &DeviceContext,
on_device: bool,
is_async: bool,
) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: VecOps<F>,
{
<<F as FieldImpl>::Config as VecOps<F>>::transpose(input, row_size, column_size, output, ctx, on_device, is_async)
}
#[macro_export]
macro_rules! impl_vec_ops_field {
(
@@ -157,6 +183,17 @@ macro_rules! impl_vec_ops_field {
cfg: *const VecOpsConfig,
result: *mut $field,
) -> CudaError;
#[link_name = concat!($field_prefix, "TransposeMatrix")]
pub(crate) fn transpose_cuda(
input: *const $field,
row_size: u32,
column_size: u32,
output: *mut $field,
ctx: *const DeviceContext,
on_device: bool,
is_async: bool,
) -> CudaError;
}
}
@@ -214,6 +251,29 @@ macro_rules! impl_vec_ops_field {
.wrap()
}
}
fn transpose(
input: &HostOrDeviceSlice<$field>,
row_size: u32,
column_size: u32,
output: &mut HostOrDeviceSlice<$field>,
ctx: &DeviceContext,
on_device: bool,
is_async: bool,
) -> IcicleResult<()> {
unsafe {
$field_prefix_ident::transpose_cuda(
input.as_ptr(),
row_size,
column_size,
output.as_mut_ptr(),
ctx as *const _ as *const DeviceContext,
on_device,
is_async,
)
.wrap()
}
}
}
};
}