transpose kernel in vec_ops and rust binding (#462)
## Describe the changes

This PR adds an `extern "C"` entry point for the transpose kernel, which now lives in vec_ops.cu, along with a Rust binding. The test `check_ntt_batch` was updated to use the new transpose function, and it passes.

## Linked Issues

Resolves #

---------

Co-authored-by: LeonHibnik <leon@ingonyama.com>
@@ -33,6 +33,14 @@ namespace vec_ops {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; }
  }

  template <typename E>
  __global__ void transpose_kernel(const E* in, E* out, uint32_t row_size, uint32_t column_size)
  {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= row_size * column_size) return;
    out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
  }
} // namespace

template <typename E, void (*Kernel)(E*, E*, int, E*)>
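The index arithmetic in `transpose_kernel` reads `in` as a row-major matrix with `column_size` rows of `row_size` elements each: element `tid` sits at row `tid / row_size`, column `tid % row_size`, and is written to the row-major position `(column, row)` of the output. A minimal host-side sketch of the same mapping (illustrative Rust, not part of the PR):

```rust
// Pure-Rust model of transpose_kernel's index arithmetic (illustrative only).
// `input` is row-major with `column_size` rows of `row_size` elements each;
// the result is row-major with `row_size` rows of `column_size` elements.
fn transpose_ref<T: Copy>(input: &[T], row_size: usize, column_size: usize) -> Vec<T> {
    let mut out = input.to_vec(); // every slot is overwritten below
    for tid in 0..row_size * column_size {
        // Same formula as the kernel body.
        out[(tid % row_size) * column_size + tid / row_size] = input[tid];
    }
    out
}

fn main() {
    // The 2x2 example from the header docs: [a0, a1, a2, a3] -> [a0, a2, a1, a3].
    assert_eq!(transpose_ref(&[0, 1, 2, 3], 2, 2), vec![0, 2, 1, 3]);
    // Two rows of three elements become three rows of two.
    assert_eq!(transpose_ref(&[1, 2, 3, 4, 5, 6], 3, 2), vec![1, 4, 2, 5, 3, 6]);
    // Round trip: transposing back requires the size arguments swapped.
    let m: Vec<u32> = (0..6).collect();
    assert_eq!(transpose_ref(&transpose_ref(&m, 3, 2), 2, 3), m);
}
```

The last assertion shows the identity the updated test below relies on: transposing back requires the size arguments swapped.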
@@ -101,6 +109,48 @@ namespace vec_ops {
    return VecOp<E, SubKernel>(vec_a, vec_b, n, config, result);
  }

  template <typename E>
  cudaError_t transpose_matrix(
    const E* mat_in,
    E* mat_out,
    uint32_t row_size,
    uint32_t column_size,
    device_context::DeviceContext& ctx,
    bool on_device,
    bool is_async)
  {
    int number_of_threads = MAX_THREADS_PER_BLOCK;
    int number_of_blocks = (row_size * column_size + number_of_threads - 1) / number_of_threads;
    cudaStream_t stream = ctx.stream;

    const E* d_mat_in;
    E* d_allocated_input = nullptr;
    E* d_mat_out;
    if (!on_device) {
      CHK_IF_RETURN(cudaMallocAsync(&d_allocated_input, row_size * column_size * sizeof(E), ctx.stream));
      CHK_IF_RETURN(cudaMemcpyAsync(
        d_allocated_input, mat_in, row_size * column_size * sizeof(E), cudaMemcpyHostToDevice, ctx.stream));

      CHK_IF_RETURN(cudaMallocAsync(&d_mat_out, row_size * column_size * sizeof(E), ctx.stream));
      d_mat_in = d_allocated_input;
    } else {
      d_mat_in = mat_in;
      d_mat_out = mat_out;
    }

    transpose_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(d_mat_in, d_mat_out, row_size, column_size);

    if (!on_device) {
      CHK_IF_RETURN(
        cudaMemcpyAsync(mat_out, d_mat_out, row_size * column_size * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream));
      CHK_IF_RETURN(cudaFreeAsync(d_mat_out, ctx.stream));
      CHK_IF_RETURN(cudaFreeAsync(d_allocated_input, ctx.stream));
    }
    if (!is_async) return CHK_STICKY(cudaStreamSynchronize(ctx.stream));

    return CHK_LAST();
  }

  /**
   * Extern version of [Mul](@ref Mul) function with the template parameters
   * `S` and `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build.
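The wrapper launches one thread per matrix element, so the block count is the ceiling of `row_size * column_size / MAX_THREADS_PER_BLOCK`; when `on_device` is false, the inputs are first staged to the device with `cudaMallocAsync`/`cudaMemcpyAsync` on the caller's stream and the result is copied back afterwards. A minimal sketch of the same ceiling division (Rust for illustration; the 256-thread block size is an assumed value, and `MAX_THREADS_PER_BLOCK` itself is defined elsewhere in the codebase):

```rust
// Ceiling division used for the grid size: (n + threads - 1) / threads.
fn number_of_blocks(elements: u32, threads_per_block: u32) -> u32 {
    (elements + threads_per_block - 1) / threads_per_block
}

fn main() {
    // e.g. a 16 x 100 matrix with an assumed 256 threads per block:
    assert_eq!(number_of_blocks(16 * 100, 256), 7); // ceil(1600 / 256) = 7
    // Exact multiples need no extra block:
    assert_eq!(number_of_blocks(1 << 12, 256), 16);
}
```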
@@ -146,4 +196,21 @@ namespace vec_ops {
    return Sub<curve_config::scalar_t>(vec_a, vec_b, n, config, result);
  }

  /**
   * Extern version of [transpose_matrix](@ref transpose_matrix) function with the template parameter
   * `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build.
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(CURVE, TransposeMatrix)(
    const curve_config::scalar_t* input,
    uint32_t row_size,
    uint32_t column_size,
    curve_config::scalar_t* output,
    device_context::DeviceContext& ctx,
    bool on_device,
    bool is_async)
  {
    return transpose_matrix<curve_config::scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
  }

} // namespace vec_ops
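`CONCAT_EXPAND(CURVE, TransposeMatrix)` prefixes the exported C symbol with the curve selected at build time, so each per-curve build exposes its own entry point; for example, building with `-DCURVE=bn254` would presumably export `bn254TransposeMatrix`, which is the name the Rust macro later binds via `#[link_name = concat!($field_prefix, "TransposeMatrix")]`.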
@@ -95,6 +95,30 @@ namespace vec_ops {
   */
  template <typename E>
  cudaError_t Sub(E* vec_a, E* vec_b, int n, VecOpsConfig<E>& config, E* result);

  /**
   * Transposes an input matrix out-of-place on the GPU.
   * For example: for ([a[0],a[1],a[2],a[3]], 2, 2) it returns
   * [a[0],a[2],a[1],a[3]].
   * @param mat_in Array of elements of type E of size row_size * column_size.
   * @param mat_out Buffer of the same size as `mat_in` to write the transposed matrix into.
   * @param row_size Number of elements in each row of `mat_in`.
   * @param column_size Number of elements in each column of `mat_in`.
   * @param ctx Device context.
   * @param on_device Whether the input and output are on device.
   * @param is_async Whether to run the vector operations asynchronously.
   * @tparam E The type of the elements of `mat_in` and `mat_out`.
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename E>
  cudaError_t transpose_matrix(
    const E* mat_in,
    E* mat_out,
    uint32_t row_size,
    uint32_t column_size,
    device_context::DeviceContext& ctx,
    bool on_device,
    bool is_async);
} // namespace vec_ops

#endif
@@ -9,6 +9,7 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
use crate::{
    ntt::{initialize_domain, initialize_domain_fast_twiddles_mode, ntt, ntt_inplace, NTTDir, NttAlgorithm, Ordering},
    traits::{ArkConvertible, FieldImpl, GenerateRandom},
    vec_ops::{transpose_matrix, VecOps},
};

use super::{NTTConfig, NTT};
@@ -235,6 +236,7 @@ where
pub fn check_ntt_batch<F: FieldImpl>()
where
    <F as FieldImpl>::Config: NTT<F> + GenerateRandom<F>,
    <F as FieldImpl>::Config: VecOps<F>,
{
    let test_sizes = [1 << 4, 1 << 12];
    let batch_sizes = [1, 1 << 4, 100];
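The added `VecOps<F>` bound on `F::Config` is what makes the GPU transpose callable from this test: `transpose_matrix` (added further down) dispatches to the `transpose` method of the `VecOps` trait.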
@@ -278,18 +280,38 @@ where
            }
        }

        let row_size = test_size as u32;
        let column_size = batch_size as u32;
        let on_device = false;
        let is_async = false;
        // for now, columns batching only works with MixedRadix NTT
        config.batch_size = batch_size as i32;
        config.columns_batch = true;
        let mut transposed_input = HostOrDeviceSlice::on_host(vec![F::zero(); batch_size * test_size]);
        transpose_matrix(
            &scalars,
            row_size,
            column_size,
            &mut transposed_input,
            &config.ctx,
            on_device,
            is_async,
        )
        .unwrap();
        let mut col_batch_ntt_result = HostOrDeviceSlice::on_host(vec![F::zero(); batch_size * test_size]);
        ntt(&transposed_input, is_inverse, &config, &mut col_batch_ntt_result).unwrap();
        assert_eq!(
            batch_ntt_result[..],
            transpose_flattened_matrix(&col_batch_ntt_result[..], test_size)
        );
        transpose_matrix(
            &col_batch_ntt_result,
            column_size,
            row_size,
            &mut transposed_input,
            &config.ctx,
            on_device,
            is_async,
        )
        .unwrap();
        assert_eq!(batch_ntt_result[..], *transposed_input.as_slice());
        config.columns_batch = false;
    }
}
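Note the swapped size arguments in the second `transpose_matrix` call above: `col_batch_ntt_result` lives in the transposed layout (`test_size` rows of `batch_size` elements), so mapping it back to the row layout of `batch_ntt_result` passes `(column_size, row_size)` instead of `(row_size, column_size)`. The round-trip assertion in the host-side sketch after the kernel hunk illustrates the same identity.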
@@ -63,6 +63,16 @@ pub trait VecOps<F> {
        result: &mut HostOrDeviceSlice<F>,
        cfg: &VecOpsConfig,
    ) -> IcicleResult<()>;

    fn transpose(
        input: &HostOrDeviceSlice<F>,
        row_size: u32,
        column_size: u32,
        output: &mut HostOrDeviceSlice<F>,
        ctx: &DeviceContext,
        on_device: bool,
        is_async: bool,
    ) -> IcicleResult<()>;
}

fn check_vec_ops_args<F>(a: &HostOrDeviceSlice<F>, b: &HostOrDeviceSlice<F>, result: &mut HostOrDeviceSlice<F>) {
@@ -118,6 +128,22 @@ where
    <<F as FieldImpl>::Config as VecOps<F>>::mul(a, b, result, cfg)
}

pub fn transpose_matrix<F>(
    input: &HostOrDeviceSlice<F>,
    row_size: u32,
    column_size: u32,
    output: &mut HostOrDeviceSlice<F>,
    ctx: &DeviceContext,
    on_device: bool,
    is_async: bool,
) -> IcicleResult<()>
where
    F: FieldImpl,
    <F as FieldImpl>::Config: VecOps<F>,
{
    <<F as FieldImpl>::Config as VecOps<F>>::transpose(input, row_size, column_size, output, ctx, on_device, is_async)
}

#[macro_export]
macro_rules! impl_vec_ops_field {
    (
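A hedged usage sketch of the new wrapper, not from the PR: the helper name and the import paths for `HostOrDeviceSlice` and `DeviceContext` are assumptions based on the surrounding code (the diff itself only shows `crate::vec_ops::...`); the call shape mirrors `check_ntt_batch` above (host buffers, synchronous).

```rust
use icicle_core::traits::FieldImpl; // assumed crate paths
use icicle_core::vec_ops::{transpose_matrix, VecOps};
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;

/// Hypothetical helper: transpose a host-resident row-major matrix on the GPU
/// and hand back a freshly allocated host buffer.
fn transpose_on_host<F>(
    input: &HostOrDeviceSlice<F>,
    row_size: u32,
    column_size: u32,
    ctx: &DeviceContext,
) -> HostOrDeviceSlice<F>
where
    F: FieldImpl,
    <F as FieldImpl>::Config: VecOps<F>,
{
    let mut output = HostOrDeviceSlice::on_host(vec![F::zero(); (row_size * column_size) as usize]);
    // Host buffers and a blocking call: on_device = false, is_async = false.
    transpose_matrix(input, row_size, column_size, &mut output, ctx, false, false).unwrap();
    output
}
```

Passing `on_device = true` instead would skip the staging copies in the C++ wrapper and treat both slices as device memory.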
@@ -157,6 +183,17 @@ macro_rules! impl_vec_ops_field {
            cfg: *const VecOpsConfig,
            result: *mut $field,
        ) -> CudaError;

        #[link_name = concat!($field_prefix, "TransposeMatrix")]
        pub(crate) fn transpose_cuda(
            input: *const $field,
            row_size: u32,
            column_size: u32,
            output: *mut $field,
            ctx: *const DeviceContext,
            on_device: bool,
            is_async: bool,
        ) -> CudaError;
    }
}
@@ -214,6 +251,29 @@ macro_rules! impl_vec_ops_field {
                .wrap()
            }
        }

        fn transpose(
            input: &HostOrDeviceSlice<$field>,
            row_size: u32,
            column_size: u32,
            output: &mut HostOrDeviceSlice<$field>,
            ctx: &DeviceContext,
            on_device: bool,
            is_async: bool,
        ) -> IcicleResult<()> {
            unsafe {
                $field_prefix_ident::transpose_cuda(
                    input.as_ptr(),
                    row_size,
                    column_size,
                    output.as_mut_ptr(),
                    ctx as *const _ as *const DeviceContext,
                    on_device,
                    is_async,
                )
                .wrap()
            }
        }
    }
    };
}