Mirror of https://github.com/pseXperiments/icicle.git (synced 2026-01-11 16:38:27 -05:00)
Compare commits: cpu-mult-o ... mini-cours (13 commits)
| SHA1 |
|---|
| 6b9732e67e |
| 3d8a6fbca2 |
| dadc5fcc24 |
| 8550aeddd3 |
| 1e44f59b37 |
| c4105aa8d5 |
| b754e66153 |
| a0fa0c66b6 |
| 0fe27bd480 |
| 0c9ae9f4b4 |
| 714ea4a213 |
| c6a4c2a6a7 |
| e1ac80e8ce |
icicle/src/mini-course-examples/Makefile (new file, 44 lines)
@@ -0,0 +1,44 @@
build_test:
	mkdir -p work
	nvcc -o work/test -std=c++17 -arch=sm_80 -I. -I../../include test.cu

run_test:
	mkdir -p work
	nvcc -o work/test -std=c++17 -arch=sm_80 -I. -I../../include test.cu
	work/test

build_perf:
	mkdir -p work
	nvcc -lineinfo -o work/perf -std=c++17 -arch=sm_80 -I. -I../../include perf_test.cu

run_perf:
	make build_perf
	work/perf

build_mem:
	mkdir -p work
	nvcc -lineinfo -o work/mem -std=c++17 -arch=sm_80 -I. -I../../include memory_test.cu

run_mem:
	make build_mem
	work/mem

build_transpose:
	mkdir -p work
	nvcc -lineinfo -o work/transpose -std=c++17 -arch=sm_80 -I. -I../../include transpose_test.cu

run_transpose:
	make build_transpose
	work/transpose

build_compute:
	mkdir -p work
	nvcc -lineinfo -o work/compute -std=c++17 -arch=sm_80 -I. -I../../include compute_test.cu

run_compute:
	make build_compute
	work/compute
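Each build_* target compiles one example with nvcc for sm_80 (Ampere); the -lineinfo flag on the benchmark targets keeps source-line information so profilers such as Nsight Compute can attribute metrics back to source lines. Each run_* target builds its example and then executes the binary from work/, e.g. make run_mem compiles memory_test.cu into work/mem and runs it.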
icicle/src/mini-course-examples/compute_test.cu (new file, 130 lines)
@@ -0,0 +1,130 @@
#include "fields/id.h"
// #define FIELD_ID 1001
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"

#include <chrono>
#include <iostream>
#include <vector>
#include <random>
#include <cub/device/device_radix_sort.cuh>

#include "fields/field.cuh"
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"

#include "kernels.cu"

class Dummy_Scalar
{
public:
  static constexpr unsigned NBITS = 32;

  unsigned x;
  unsigned p = 10;
  // unsigned p = 1<<30;

  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }

  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
  {
    os << scalar.x;
    return os;
  }

  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
  {
    return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
  }

  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    return {(p1.x + p2.x) % p1.p};
  }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
  static HOST_INLINE Dummy_Scalar rand_host()
  {
    return {(unsigned)rand() % 10};
    // return {(unsigned)rand()};
  }
};

// typedef field_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

// typedef uint32_t test_t;
// typedef int4 test_t;
// typedef Dummy_Scalar test_t;
typedef test_projective test_t;
// typedef test_scalar test_t;

#define REPS 8

int main()
{
  cudaEvent_t start, stop;
  float kernel_time;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  int N = 1<<22;
  // int N = 1<<25;

  test_t* arr1_h = new test_t[N];
  test_t* arr2_h = new test_t[N];

  test_t *arr1_d, *arr2_d;

  cudaMalloc(&arr1_d, N*sizeof(test_t));
  cudaMalloc(&arr2_d, N*sizeof(test_t));

  for (int i = 0; i < N; i++)
  {
    arr1_h[i] = i > 100? arr1_h[i-100] : test_t::rand_host();
    // arr1_h[i] = i > 100? arr1_h[i-100] : rand();
  }

  cudaMemcpy(arr1_d, arr1_h, sizeof(test_t) * N, cudaMemcpyHostToDevice);

  int THREADS = 128;
  int BLOCKS = (N + THREADS - 1)/THREADS;

  // warm up
  add_many_times<test_t,16><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // multi_mult<test_t,8><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  cudaEventRecord(start, 0);

  // add_many_times<test_t,REPS><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // multi_add<test_t,REPS><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // limb_mult_bench<REPS><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  segment_sum<test_t,REPS><<<BLOCKS, THREADS>>>(arr1_d, N);
  // shmem_segment_sum<test_t,REPS><<<BLOCKS, THREADS>>>(arr1_d, N);
  // multi_mult<test_t,REPS><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // multi_ntt8<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);

  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
  cudaEventRecord(stop, 0);
  cudaStreamSynchronize(0);
  cudaEventElapsedTime(&kernel_time, start, stop);
  printf("kernel_time : %.3f ms.\n", kernel_time);
  // printf("normalized kernel_time : %.3f ms.\n", kernel_time/REPS);

  return 0;
}
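The tests above repeat the synchronize-then-print pattern by hand after every launch. A minimal helper in the same style would factor it out (a sketch, not part of the original files; check_cuda is a hypothetical name):

inline void check_cuda(const char* where)
{
  cudaDeviceSynchronize();               // wait for pending kernels to finish
  cudaError_t err = cudaGetLastError();  // read (and clear) the sticky launch error
  if (err != cudaSuccess)
    std::cout << where << " cuda err: " << cudaGetErrorString(err) << std::endl;
}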
icicle/src/mini-course-examples/kernels.cu (new file, 457 lines)
@@ -0,0 +1,457 @@
template <class T>
__global__ void add_elements_kernel(const T* x, const T* y, T* result, const unsigned count)
{
  const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= count) return;
  result[tid] = x[tid] + y[tid];
}
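// fake_ntt_kernel runs two add/subtract butterfly stages through shared memory,
// mimicking the data movement of a small NTT without real twiddle factors:
// stage 1 combines adjacent input pairs into shmem, stage 2 recombines entries
// half the tile apart. It needs 4*blockDim.x elements of dynamic shared memory.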
template <class T>
__global__ void fake_ntt_kernel(const T* x, T* result, const unsigned thread_count)
{
  extern __shared__ T shmem[];
  const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= thread_count) return;
  shmem[4*threadIdx.x] = x[4*tid] + x[4*tid+1];
  shmem[4*threadIdx.x+1] = x[4*tid] + T::neg(x[4*tid+1]);
  shmem[4*threadIdx.x+2] = x[4*tid+2] + x[4*tid+3];
  shmem[4*threadIdx.x+3] = x[4*tid+2] + T::neg(x[4*tid+3]);
  __syncthreads();
  result[4*tid] = shmem[2*threadIdx.x] + shmem[2*threadIdx.x + 4*blockDim.x/2];
  result[4*tid+1] = shmem[2*threadIdx.x] + T::neg(shmem[2*threadIdx.x + 4*blockDim.x/2]);
  result[4*tid+2] = shmem[2*threadIdx.x+1] + shmem[2*threadIdx.x + 4*blockDim.x/2+1];
  result[4*tid+3] = shmem[2*threadIdx.x+1] + T::neg(shmem[2*threadIdx.x + 4*blockDim.x/2+1]);
}
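// The two bugged_* kernels below are intentionally broken copies kept for the
// debugging part of the course: the bounds checks are commented out (so threads
// past the end read and write out of bounds), bugged_fake_ntt_kernel reads
// x[4*tid+1] where the correct kernel reads x[4*tid+3], writes
// shmem[4*threadIdx.x+4] instead of +3, and its second stage uses the offset
// 4*blockDim.x instead of 4*blockDim.x/2.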
template <class T>
__global__ void bugged_add_elements_kernel(const T* x, const T* y, T* result, const unsigned count)
{
  const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
  // if (tid >= count) return;
  // printf("tid %d\n", tid);
  result[tid] = x[tid] + y[tid];
}

template <class T>
__global__ void bugged_fake_ntt_kernel(const T* x, T* result, const unsigned thread_count)
{
  extern __shared__ T shmem[];
  const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;

  // if (tid >= thread_count) return;
  // if (tid == 0){
  //   for (int i = 0; i < 8; i++)
  //   {
  //     shmem[i]=T::zero();
  //   }
  // }

  shmem[4*threadIdx.x] = x[4*tid] + x[4*tid+1];
  shmem[4*threadIdx.x+1] = x[4*tid] + T::neg(x[4*tid+1]);
  shmem[4*threadIdx.x+2] = x[4*tid+2] + x[4*tid+1];
  shmem[4*threadIdx.x+4] = x[4*tid+2] + T::neg(x[4*tid+1]);

  __syncthreads();

  // if (tid == 0){
  //   for (int i = 0; i < 8; i++)
  //   {
  //     printf("%d ",shmem[i]);
  //   }
  //   printf("\n");
  // }

  // printf("tid: %d, addr1: %d, addr2: %d\n", tid, 2*threadIdx.x, 2*threadIdx.x + 4*blockDim.x);
  result[4*tid] = shmem[2*threadIdx.x] + shmem[2*threadIdx.x + 4*blockDim.x]; // Incorrect offset
  result[4*tid+1] = shmem[2*threadIdx.x] + T::neg(shmem[2*threadIdx.x + 4*blockDim.x]); // Incorrect offset
  result[4*tid+2] = shmem[2*threadIdx.x+1] + shmem[2*threadIdx.x + 4*blockDim.x+1]; // Incorrect offset
  result[4*tid+3] = shmem[2*threadIdx.x+1] + T::neg(shmem[2*threadIdx.x + 4*blockDim.x+1]); // Incorrect offset
}
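// The bucket_acc_* kernels model a bucket-accumulation step (one thread per
// bucket, sizes[tid] additions each), in the style of an MSM inner loop.
// bucket_acc_naive re-reads and re-writes global memory on every iteration;
// bucket_acc_memory_baseline isolates the memory cost (one read, one write);
// bucket_acc_compute_baseline multiplies the arithmetic by 100x to isolate
// compute; bucket_acc_reg keeps the running value in a register and touches
// global memory only twice.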
template <class T>
__global__ void bucket_acc_naive(T* buckets, unsigned* indices, unsigned* sizes, unsigned nof_buckets){
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= nof_buckets) return;
  for (int i = 0; i < sizes[tid]; i++)
  {
    buckets[indices[tid]] = buckets[indices[tid]] + buckets[indices[tid]];
  }
}

template <class T>
__global__ void bucket_acc_memory_baseline(T* buckets1, T* buckets2, unsigned* indices, unsigned nof_buckets){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= nof_buckets) return;
  buckets2[indices[tid]] = buckets1[indices[tid]];
}

template <class T>
__global__ void bucket_acc_compute_baseline(T* buckets, unsigned* indices, unsigned* sizes, unsigned nof_buckets){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= nof_buckets) return;
  T bucket = buckets[indices[tid]];
  for (int j = 0; j < 100; j++)
  {
    for (int i = 0; i < sizes[tid]; i++)
    {
      bucket = bucket + bucket;
    }
  }
  buckets[indices[tid]] = bucket;
}

template <class T>
__global__ void bucket_acc_reg(T* buckets, unsigned* indices, unsigned* sizes, unsigned nof_buckets){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= nof_buckets) return;
  T bucket = buckets[indices[tid]];
  for (int i = 0; i < sizes[tid]; i++)
  {
    bucket = bucket + bucket;
  }
  buckets[indices[tid]] = bucket;
}

// #define NOF_TH 32*64
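// The *memory_copy kernels compare global-memory access patterns on a raw byte
// buffer reinterpreted as T. device_memory_copy is a plain fully coalesced 1:1
// copy; segmented_memory_copy remaps the linear thread index through a
// read-segment/write-segment permutation, so coalescing only holds within runs
// of read_segment_length consecutive threads; multi_memory_copy1 has each
// thread copy several elements a full segment apart (coalesced on every
// iteration), while multi_memory_copy2 gives each thread a contiguous chunk
// (strided, hence uncoalesced, across a warp).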
template <class T, int SIZE_T>
__global__ void device_memory_copy(void* arr1_raw, void* arr2_raw, unsigned size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= size/SIZE_T) return;
  T* arr1=(T*)arr1_raw;
  T* arr2=(T*)arr2_raw;
  arr2[tid] = arr1[tid];
}

template <class T, int SIZE_T>
__global__ void segmented_memory_copy(void* arr1_raw, void* arr2_raw, unsigned size, unsigned read_segment_length, unsigned nof_write_segments){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_elements = size/SIZE_T;
  int write_segment_length = nof_elements / nof_write_segments;
  int r_segment_idx = tid / read_segment_length;
  int r_segment_tid = tid % read_segment_length;
  int w_segment_idx = r_segment_idx % nof_write_segments;
  int w_segment_tid = r_segment_idx / nof_write_segments;
  int addr = w_segment_idx * write_segment_length + w_segment_tid * read_segment_length + r_segment_tid;
  // if (tid < 50) printf("tid %d, addr %d\n", tid, addr);
  if (tid >= nof_elements) return;
  T* arr1=(T*)arr1_raw;
  T* arr2=(T*)arr2_raw;
  arr2[addr] = arr1[addr];
}

template <class T, int SIZE_T>
__global__ void multi_memory_copy1(void* arr1_raw, void* arr2_raw, unsigned size, unsigned nof_elements_per_thread){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_elements = size/SIZE_T;
  int segment_length = nof_elements / nof_elements_per_thread;
  if (tid >= segment_length) return;
  T* arr1=(T*)arr1_raw;
  T* arr2=(T*)arr2_raw;
  for (int i = 0; i < nof_elements_per_thread; i++)
  {
    arr2[tid + i*segment_length] = arr1[tid + i*segment_length];
  }
}

template <class T, int SIZE_T>
__global__ void multi_memory_copy2(void* arr1_raw, void* arr2_raw, unsigned size, unsigned nof_elements_per_thread){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_elements = size/SIZE_T;
  int nof_threads = nof_elements / nof_elements_per_thread;
  if (tid >= nof_threads) return;
  T* arr1=(T*)arr1_raw;
  T* arr2=(T*)arr2_raw;
  for (int i = 0; i < nof_elements_per_thread; i++)
  {
    arr2[tid*nof_elements_per_thread + i] = arr1[tid*nof_elements_per_thread + i];
  }
}

template <class T>
__global__ void simple_memory_copy(T* in, T* out, unsigned size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = in[tid];
}
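// Square-matrix transpose three ways: naive_transpose_write has coalesced reads
// but strided writes, naive_transpose_read the reverse, and shmem_transpose
// stages 16x16 tiles in shared memory so that both the global read and the
// global write are coalesced (it assumes blockDim.x == 256, one tile per block,
// and row_length divisible by 16).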
template <class T>
__global__ void naive_transpose_write(T *in, T *out, int row_length){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= row_length * row_length) return;
  int row_id = tid / row_length;
  int col_id = tid % row_length;
  out[col_id * row_length + row_id] = in[tid];
}

template <class T>
__global__ void naive_transpose_read(T *in, T *out, int row_length){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= row_length * row_length) return;
  int row_id = tid / row_length;
  int col_id = tid % row_length;
  out[tid] = in[col_id * row_length + row_id];
}

template <class T>
__global__ void shmem_transpose(T *in, T *out, int row_length){
  __shared__ T shmem[16][16];
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= row_length * row_length) return;
  int shmem_col_id = threadIdx.x / 16;
  int shmem_row_id = threadIdx.x % 16;
  int blocks_per_row = row_length / 16;
  int block_row_id = blockIdx.x / blocks_per_row;
  int block_col_id = blockIdx.x % blocks_per_row;
  // shmem[shmem_col_id][shmem_row_id] = in[block_row_id*row_length*16 + block_col_id*16 + shmem_col_id*row_length + shmem_row_id];
  shmem[shmem_row_id][shmem_col_id] = in[block_row_id*row_length*16 + block_col_id*16 + shmem_col_id*row_length + shmem_row_id];
  __syncthreads();
  // out[block_col_id*row_length*16 + block_row_id*16 + shmem_col_id*row_length + shmem_row_id] = shmem[shmem_row_id][shmem_col_id];
  out[block_col_id*row_length*16 + block_row_id*16 + shmem_col_id*row_length + shmem_row_id] = shmem[shmem_col_id][shmem_row_id];
}
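// Arithmetic micro-benchmarks: add_many_times keeps a value in registers and
// doubles it REPS times (compute-bound, one load and one store per thread);
// multi_add streams REPS segments through global memory (memory-bound);
// segment_sum has each thread serially accumulate SEG_SIZE consecutive
// elements; shmem_segment_sum is the same computation with the per-thread
// accumulators forced into shared memory; multi_mult swaps the additions for
// multiplications.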
template <class T, int REPS>
__global__ void add_many_times(T *in, T *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= size) return;
  T temp;
  #pragma unroll
  for (int i = 0; i < REPS; i++)
  {
    temp = i? temp + temp : in[tid];
  }
  out[tid] = temp;
}

template <class T, int REPS>
__global__ void multi_add(T *in, T *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int segment_length = size / REPS;
  if (tid >= segment_length) return;
  // #pragma unroll 1
  for (int i = 0; i < REPS; i++)
  {
    out[tid + i*segment_length] = in[tid + i*segment_length] + in[tid + i*segment_length];
  }
}

template <class T, int SEG_SIZE>
__global__ void segment_sum(T *inout, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_segments = size / SEG_SIZE;
  if (tid >= nof_segments) return;
  T sum = T::zero();
  T sums_sum = T::zero();
  for (int i = 0; i < SEG_SIZE; i++)
  {
    sums_sum = sums_sum + sum;
    sum = sum + inout[tid * SEG_SIZE + i];
  }
  inout[tid * SEG_SIZE] = sums_sum;
  // inout[tid * SEG_SIZE] = sum;
}

template <class T, int SEG_SIZE>
__global__ void shmem_segment_sum(T *inout, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_segments = size / SEG_SIZE;
  if (tid >= nof_segments) return;
  __shared__ T shmem[128*2];
  // T sum = T::zero();
  // T sums_sum = T::zero();
  shmem[2*threadIdx.x] = T::zero(); //sum
  shmem[2*threadIdx.x + 1] = T::zero(); //sums_sum
  for (int i = 0; i < SEG_SIZE; i++)
  {
    {T sum = shmem[2*threadIdx.x];
    T sums_sum = shmem[2*threadIdx.x + 1];
    shmem[2*threadIdx.x + 1] = sums_sum + sum;}
    // {T sum = shmem[2*(127-threadIdx.x)];
    // T sums_sum = shmem[2*(127-threadIdx.x) + 1];
    // shmem[2*(127-threadIdx.x) + 1] = sums_sum + sum;}
    // shmem[2*(127-threadIdx.x) + 1] = shmem[2*(127-threadIdx.x) + 1] + shmem[2*(127-threadIdx.x)];
    // shmem[2*threadIdx.x + 1] = shmem[2*threadIdx.x + 1] + shmem[2*threadIdx.x];
    // __syncthreads();
    {T sum = shmem[2*threadIdx.x];
    T sums_sum = inout[tid * SEG_SIZE + i];
    shmem[2*threadIdx.x] = sum + sums_sum;}
    // shmem[2*threadIdx.x] = shmem[2*threadIdx.x] + inout[tid * SEG_SIZE + i];
    // __syncthreads();
  }
  inout[tid * SEG_SIZE] = shmem[2*threadIdx.x + 1];
  // inout[tid * SEG_SIZE] = sum;
}

template <class T, int REPS>
__global__ void multi_mult(T *in, T *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int segment_length = size / REPS;
  if (tid >= segment_length) return;
  #pragma unroll 1
  for (int i = 0; i < REPS; i++)
  {
    out[tid + i*segment_length] = in[tid + i*segment_length] * in[tid + i*segment_length];
  }
}
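// Register-level 8-point butterfly networks over an element type E (twiddle
// factors are omitted, so only the add/sub network is measured): ntt8opt reuses
// a single temporary, ntt8 uses a full second set of eight registers.
// multi_ntt8 below loads 8 strided elements per thread, runs one network, and
// stores them back.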
template <class E>
DEVICE_INLINE void ntt8opt(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
{
  E T;

  T = X3 - X7;
  X7 = X3 + X7;
  X3 = X1 - X5;
  X5 = X1 + X5;
  X1 = X2 + X6;
  X2 = X2 - X6;
  X6 = X0 + X4;
  X0 = X0 - X4;

  X4 = X6 + X1;
  X6 = X6 - X1;
  X1 = X3 + T;
  X3 = X3 - T;
  T = X5 + X7;
  X5 = X5 - X7;
  X7 = X0 + X2;
  X0 = X0 - X2;

  X2 = X6 + X5;
  X6 = X6 - X5;
  X5 = X7 - X1;
  X1 = X7 + X1;
  X7 = X0 - X3;
  X3 = X0 + X3;
  X0 = X4 + T;
  X4 = X4 - T;
}

template <class E>
DEVICE_INLINE void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
{
  E Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7;

  Y0 = X0 + X4;
  Y1 = X0 - X4;
  Y2 = X1 - X5;
  Y3 = X1 + X5;
  Y4 = X2 + X6;
  Y5 = X2 - X6;
  Y6 = X3 - X7;
  Y7 = X3 + X7;

  X0 = Y0 + Y2;
  X1 = Y0 - Y2;
  X2 = Y1 - Y3;
  X3 = Y1 + Y3;
  X4 = Y4 + Y6;
  X5 = Y4 - Y6;
  X6 = Y5 - Y7;
  X7 = Y5 + Y7;

  Y0 = X0 + X1;
  Y1 = X0 - X1;
  Y2 = X2 - X3;
  Y3 = X2 + X3;
  Y4 = X4 + X5;
  Y5 = X4 - X5;
  Y6 = X6 - X7;
  Y7 = X6 + X7;

  X0 = Y0;
  X1 = Y1;
  X2 = Y2;
  X3 = Y3;
  X4 = Y4;
  X5 = Y5;
  X6 = Y6;
  X7 = Y7;
}

template <class T>
__global__ void multi_ntt8(T *in, T *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int segment_length = size / 8;
  if (tid >= segment_length) return;
  T X[8];
  #pragma unroll
  for (int i = 0; i < 8; i++)
  {
    X[i] = in[tid + i*segment_length];
  }
  // ntt8(X[0],X[1],X[2],X[3],X[4],X[5],X[6],X[7]);
  ntt8opt(X[0],X[1],X[2],X[3],X[4],X[5],X[6],X[7]);
  #pragma unroll
  for (int i = 0; i < 8; i++)
  {
    out[tid + i*segment_length] = X[i];
  }
}
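// Multi-precision multiply sketches on 32-bit limbs via the inline-PTX wrappers:
// mul_naive accumulates the four partial products in order with mad_lo/mad_hi
// (carries between limbs are not fully propagated, and its odd[] scratch array
// is unused -- these are throughput benchmarks rather than exact products);
// mul_icicle, named after the scheme in icicle's field multiplication, computes
// even and odd partial products separately and merges them with one carry chain
// (add_cc/addc_cc/addc). The two trailing add_cc lines in each compress four
// limbs back into two so limb_mult_bench can feed the result into the next
// iteration without the compiler discarding the work.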
__device__ void mul_naive(uint32_t *a, uint32_t *b, uint32_t *r){
  __align__(8) uint32_t odd[2];
  r[0] = ptx::mul_lo(a[0], b[0]);
  r[1] = ptx::mul_hi(a[0], b[0]);
  r[1] = ptx::mad_lo(a[0], b[1], r[1]);
  r[1] = ptx::mad_lo(a[1], b[0], r[1]);
  r[2] = ptx::mul_lo(a[1], b[1]);
  r[2] = ptx::mad_hi(a[1], b[0], r[2]);
  r[2] = ptx::mad_hi(a[0], b[1], r[2]);
  r[3] = ptx::mul_hi(a[1], b[1]);

  r[0] = ptx::add_cc(r[0], r[1]);
  r[1] = ptx::add_cc(r[2], r[3]);
}

__device__ void mul_icicle(uint32_t *a, uint32_t *b, uint32_t *r){
  __align__(8) uint32_t odd[2];
  r[0] = ptx::mul_lo(a[0], b[0]);
  r[1] = ptx::mul_hi(a[0], b[0]);
  r[2] = ptx::mul_lo(a[1], b[1]);
  r[3] = ptx::mul_hi(a[1], b[1]);
  odd[0] = ptx::mul_lo(a[0], b[1]);
  odd[1] = ptx::mul_hi(a[0], b[1]);
  odd[0] = ptx::mad_lo(a[1], b[0], odd[0]);
  odd[1] = ptx::mad_hi(a[1], b[0], odd[1]);
  r[1] = ptx::add_cc(r[1], odd[0]);
  r[2] = ptx::addc_cc(r[2], odd[1]);
  r[3] = ptx::addc(r[3], 0);

  r[0] = ptx::add_cc(r[0], r[1]);
  r[1] = ptx::add_cc(r[2], r[3]);
}

template <int REPS>
__global__ void limb_mult_bench(uint32_t *in, uint32_t *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= size/2) return;
  uint32_t res[4];
  res[0] = in[tid];
  res[1] = in[tid + size/2];
  // typename T::Wide temp;
  for (int i = 0; i < REPS; i++)
  {
    mul_naive(res, res, res);
    // mul_icicle(res, res, res);
    // T::multiply_raw_device(res.limbs_storage, res.limbs_storage, res.limbs_storage);
    // temp = T::mul_wide(res, res);
  }
  // out[tid] = T::reduce(temp);
  out[tid] = res[0];
  out[tid + size/2] = res[1];
}
icicle/src/mini-course-examples/memory_test.cu (new file, 114 lines)
@@ -0,0 +1,114 @@
#include "fields/id.h"
// #define FIELD_ID 1
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"

#include <chrono>
#include <iostream>
#include <vector>
#include <random>
#include <cub/device/device_radix_sort.cuh>

#include "fields/field.cuh"
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"

#include "kernels.cu"

class Dummy_Scalar
{
public:
  static constexpr unsigned NBITS = 32;

  unsigned x;
  unsigned p = 10;
  // unsigned p = 1<<30;

  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }

  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
  {
    os << scalar.x;
    return os;
  }

  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
  {
    return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
  }

  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    return {(p1.x + p2.x) % p1.p};
  }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
  static HOST_INLINE Dummy_Scalar rand_host()
  {
    return {(unsigned)rand() % 10};
    // return {(unsigned)rand()};
  }
};

// typedef field_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

typedef int test_t;
// typedef int4 test_t;
// typedef Dummy_Scalar test_t;
// typedef test_projective test_t;
// typedef test_scalar test_t;

int main()
{
  cudaEvent_t start, stop;
  float kernel_time;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  int N = 1<<25;

  void *arr1, *arr2;

  cudaMalloc(&arr1, N);
  cudaMalloc(&arr2, N);

  int THREADS = 256;
  int BLOCKS = (N/sizeof(test_t) + THREADS - 1)/THREADS;

  // warm up
  device_memory_copy<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N);
  segmented_memory_copy<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N, 32, 1024);
  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  cudaEventRecord(start, 0);

  device_memory_copy<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N);
  // segmented_memory_copy<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N, 2, 1024);
  // int elements_per_thread = 8;
  // BLOCKS = (N/sizeof(test_t)/elements_per_thread + THREADS - 1)/THREADS;
  // multi_memory_copy1<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N, elements_per_thread);
  // multi_memory_copy2<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N, elements_per_thread);

  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
  cudaEventRecord(stop, 0);
  cudaStreamSynchronize(0);
  cudaEventElapsedTime(&kernel_time, start, stop);
  printf("kernel_time : %.3f ms.\n", kernel_time);

  return 0;
}
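Here N is a byte count (the buffers are allocated as raw void*), so one copy reads N bytes and writes N bytes. A hedged sketch of turning the printed time into achieved bandwidth (not in the original file):

  // hypothetical addition after cudaEventElapsedTime:
  double seconds = kernel_time / 1000.0;   // event time is reported in ms
  printf("bandwidth   : %.1f GB/s\n", 2.0 * N / 1e9 / seconds);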
icicle/src/mini-course-examples/perf_test.cu (new file, 199 lines)
@@ -0,0 +1,199 @@
#include "fields/id.h"
// #define FIELD_ID 1001
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"

#include <chrono>
#include <iostream>
#include <vector>
#include <random>
#include <cub/device/device_radix_sort.cuh>

#include "fields/field.cuh"
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"

#include "kernels.cu"

class Dummy_Scalar
{
public:
  static constexpr unsigned NBITS = 32;

  unsigned x;
  unsigned p = 10;
  // unsigned p = 1<<30;

  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }

  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
  {
    os << scalar.x;
    return os;
  }

  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
  {
    return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
  }

  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    return {(p1.x + p2.x) % p1.p};
  }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
  static HOST_INLINE Dummy_Scalar rand_host()
  {
    return {(unsigned)rand() % 10};
    // return {(unsigned)rand()};
  }
};

// typedef field_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

// typedef int test_t;
// typedef int4 test_t;
// typedef Dummy_Scalar test_t;
// typedef test_projective test_t;
typedef test_scalar test_t;

int main()
{
  cudaEvent_t start, stop;
  float kernel_time;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  int N = 1<<20;
  // int N = 1<<3;

  test_t* buckets_h = new test_t[N];
  unsigned* indices_h = new unsigned[N];
  unsigned* sizes_h = new unsigned[N];

  for (int i = 0; i < N; i++)
  {
    indices_h[i] = static_cast<unsigned>(i);
    sizes_h[i] = static_cast<unsigned>(std::rand())%20;
    // sizes_h[i] = 10;
    buckets_h[i] = i<100? test_t::rand_host() : buckets_h[i-100];
    // buckets_h[i] = i<100? rand() : buckets_h[i-100];
    // buckets_h[i].x = i<100? rand() : buckets_h[i-100].x;
    // buckets_h[i].y = i<100? rand() : buckets_h[i-100].y;
    // buckets_h[i].z = i<100? rand() : buckets_h[i-100].z;
    // buckets_h[i].w = i<100? rand() : buckets_h[i-100].w;
    // if (i<10) std::cout << indices_h[i] << " " << sizes_h[i] << " " << buckets_h[i] << std::endl;
  }

  test_t *buckets_d, *buckets2_d;
  unsigned *sizes_d, *indices_d;

  cudaMalloc(&buckets_d, sizeof(test_t) * N);
  cudaMalloc(&buckets2_d, sizeof(test_t) * N);
  cudaMalloc(&sizes_d, sizeof(unsigned) * N);
  cudaMalloc(&indices_d, sizeof(unsigned) * N);

  cudaMemcpy(buckets_d, buckets_h, sizeof(test_t) * N, cudaMemcpyHostToDevice);
  cudaMemcpy(sizes_d, sizes_h, sizeof(unsigned) * N, cudaMemcpyHostToDevice);
  cudaMemcpy(indices_d, indices_h, sizeof(unsigned) * N, cudaMemcpyHostToDevice);

  int THREADS = 256;
  int BLOCKS = (N + THREADS - 1)/THREADS;

  // warm up
  bucket_acc_naive<<<BLOCKS, THREADS>>>(buckets_d, indices_d, sizes_d, N);
  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  cudaEventRecord(start, 0);

  // unsigned* sorted_sizes;
  // cudaMalloc(&sorted_sizes, sizeof(unsigned) * N);

  // unsigned* sorted_indices;
  // cudaMalloc(&sorted_indices, sizeof(unsigned) * N);
  // unsigned* sort_indices_temp_storage{};
  // size_t sort_indices_temp_storage_bytes = 0;
  // cub::DeviceRadixSort::SortPairsDescending(
  //   sort_indices_temp_storage, sort_indices_temp_storage_bytes, sizes_d,
  //   sorted_sizes, indices_d, sorted_indices, N, 0);
  // cudaMalloc(&sort_indices_temp_storage, sort_indices_temp_storage_bytes);
  // cub::DeviceRadixSort::SortPairsDescending(
  //   sort_indices_temp_storage, sort_indices_temp_storage_bytes, sizes_d,
  //   sorted_sizes, indices_d, sorted_indices, N, 0);
  // cudaFree(sort_indices_temp_storage);

  // test_t* sorted_buckets;
  // cudaMalloc(&sorted_buckets, sizeof(test_t) * N);
  // unsigned* sort_buckets_temp_storage{};
  // size_t sort_buckets_temp_storage_bytes = 0;
  // cub::DeviceRadixSort::SortPairsDescending(
  //   sort_buckets_temp_storage, sort_buckets_temp_storage_bytes, sizes_d,
  //   sorted_sizes, buckets_d, sorted_buckets, N, 0);
  // cudaMalloc(&sort_buckets_temp_storage, sort_buckets_temp_storage_bytes);
  // cub::DeviceRadixSort::SortPairsDescending(
  //   sort_buckets_temp_storage, sort_buckets_temp_storage_bytes, sizes_d,
  //   sorted_sizes, buckets_d, sorted_buckets, N, 0);
  // cudaFree(sort_buckets_temp_storage);

  // cudaEventRecord(start, 0);

  bucket_acc_naive<<<BLOCKS, THREADS>>>(buckets_d, indices_d, sizes_d, N);
  // bucket_acc_compute_baseline<<<BLOCKS, THREADS>>>(buckets_d, indices_d, sizes_d, N);
  // bucket_acc_memory_baseline<<<BLOCKS, THREADS>>>(buckets_d, buckets2_d, indices_d, N);
  // bucket_acc_reg<<<BLOCKS, THREADS>>>(buckets_d, indices_d, sizes_d, N);
  // bucket_acc_reg<<<BLOCKS, THREADS>>>(buckets_d, sorted_indices, sorted_sizes, N);
  // bucket_acc_reg<<<BLOCKS, THREADS>>>(sorted_buckets, indices_d, sorted_sizes, N);

  // simple_memory_copy<<<64, 32>>>(buckets_d, buckets2_d, N);
  // simple_memory_copy<<<BLOCKS, THREADS>>>(buckets_d, buckets2_d, N);

  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
  cudaEventRecord(stop, 0);
  cudaStreamSynchronize(0);
  cudaEventElapsedTime(&kernel_time, start, stop);
  printf("kernel_time : %.3f ms.\n", kernel_time);

  cudaMemcpy(buckets_h, buckets_d, sizeof(test_t) * N, cudaMemcpyDeviceToHost);
  // cudaMemcpy(buckets_h, sorted_buckets, sizeof(test_t) * N, cudaMemcpyDeviceToHost);
  // cudaMemcpy(sizes_h, sorted_indices, sizeof(unsigned) * N, cudaMemcpyDeviceToHost);

  // printf("res:\n");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << buckets_h[i] << "\n";
  //   // std::cout << sizes_h[i] << "\n";
  // }
  // printf("\n");
  // printf("C test: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << Cb_h[i] << ", ";
  // }
  // printf("\n");
  // printf("C ref: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << C_d[i] << ", ";
  //   // std::cout << C_h[i] << ", ";
  // }
  // printf("\n");

  return 0;
}
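The commented-out cub::DeviceRadixSort blocks sort the (sizes, indices) and (sizes, buckets) pairs in descending size order before accumulation; since each thread's loop runs sizes[tid] iterations, grouping similar trip counts into the same warp is presumably meant to reduce warp divergence in bucket_acc_reg.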
icicle/src/mini-course-examples/test.cu (new file, 199 lines)
@@ -0,0 +1,199 @@
#include "fields/id.h"
// #define FIELD_ID 2
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"

#include <chrono>
#include <iostream>
#include <vector>

#include "fields/field.cuh"
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"

#include "kernels.cu"

class Dummy_Scalar
{
public:
  static constexpr unsigned NBITS = 32;

  unsigned x;
  unsigned p = 10;
  // unsigned p = 1<<30;

  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }

  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
  {
    os << scalar.x;
    return os;
  }

  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
  {
    return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
  }

  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    return {(p1.x + p2.x) % p1.p};
  }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
  static HOST_INLINE Dummy_Scalar rand_host()
  {
    return {(unsigned)rand() % 10};
    // return {(unsigned)rand()};
  }
};

typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

// typedef Dummy_Scalar test_t;
// typedef test_projective test_t;
typedef test_scalar test_t;

void queryGPUProperties() {
  int deviceCount = 0;
  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

  if (error_id != cudaSuccess) {
    std::cerr << "cudaGetDeviceCount returned " << static_cast<int>(error_id) << " -> " << cudaGetErrorString(error_id) << std::endl;
    std::cerr << "Result = FAIL" << std::endl;
    exit(EXIT_FAILURE);
  }

  if (deviceCount == 0) {
    std::cout << "There are no available device(s) that support CUDA." << std::endl;
  } else {
    std::cout << "Detected " << deviceCount << " CUDA Capable device(s)." << std::endl;
  }

  for (int dev = 0; dev < deviceCount; ++dev) {
    cudaSetDevice(dev);

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    std::cout << "Device " << dev << ": \"" << deviceProp.name << "\"" << std::endl;
    std::cout << "  CUDA Capability Major/Minor version number: " << deviceProp.major << "." << deviceProp.minor << std::endl;
    std::cout << "  Total amount of global memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
    std::cout << "  Number of multiprocessors: " << deviceProp.multiProcessorCount << std::endl;
    std::cout << "  Total amount of global memory: " << deviceProp.totalGlobalMem << " bytes" << std::endl;
    std::cout << "  Total amount of shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
    std::cout << "  Total amount of shared memory per multiprocessor: " << deviceProp.sharedMemPerMultiprocessor << " bytes" << std::endl;
    std::cout << "  Total number of registers available per block: " << deviceProp.regsPerBlock << std::endl;
    std::cout << "  Total number of registers available per multiprocessor: " << deviceProp.regsPerMultiprocessor << std::endl;
    std::cout << "  Warp size: " << deviceProp.warpSize << std::endl;
    std::cout << "  Maximum number of threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
    std::cout << "  Maximum number of threads per multiprocessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
    std::cout << "  Maximum sizes of each dimension of a block: " << deviceProp.maxThreadsDim[0] << " x "
              << deviceProp.maxThreadsDim[1] << " x " << deviceProp.maxThreadsDim[2] << std::endl;
    std::cout << "  Maximum sizes of each dimension of a grid: " << deviceProp.maxGridSize[0] << " x "
              << deviceProp.maxGridSize[1] << " x " << deviceProp.maxGridSize[2] << std::endl;
    std::cout << "  Clock rate: " << deviceProp.clockRate / 1000 << " MHz" << std::endl;
    std::cout << "  Memory clock rate: " << deviceProp.memoryClockRate / 1000 << " MHz" << std::endl;
    std::cout << "  Memory bus width: " << deviceProp.memoryBusWidth << " bits" << std::endl;
    std::cout << "  Peak memory bandwidth: "
              << 2.0 * deviceProp.memoryClockRate * (deviceProp.memoryBusWidth / 8) / 1.0e6 << " GB/s" << std::endl;
  }
}

int main()
{
  queryGPUProperties();

  int N = 1<<20;
  // int N = 300;

  test_t* A_h = new test_t[N];
  test_t* B_h = new test_t[N];
  test_t* C_h = new test_t[N];
  test_t* Cb_h = new test_t[N];

  for (int i = 0; i < N; i++)
  {
    A_h[i] = i<100? test_t::rand_host() : A_h[i-100];
    B_h[i] = i<100? test_t::rand_host() : B_h[i-100];
  }

  test_t *A_d,*B_d,*C_d;
  test_t *Cb_d;

  cudaMalloc(&A_d, sizeof(test_t) * N);
  cudaMalloc(&B_d, sizeof(test_t) * N);
  cudaMalloc(&C_d, sizeof(test_t) * N);
  cudaMalloc(&Cb_d, sizeof(test_t) * N);

  cudaMemcpy(A_d, A_h, sizeof(test_t) * N, cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B_h, sizeof(test_t) * N, cudaMemcpyHostToDevice);

  // int THREADS = 256;
  // int BLOCKS = (N + THREADS - 1)/THREADS;
  // add_elements_kernel<<<BLOCKS, THREADS>>>(A_d, B_d, C_d, N);
  // cudaDeviceSynchronize();
  // // printf("cuda error %d\n", cudaGetLastError());
  // std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  // THREADS = 256;
  // BLOCKS = (N + THREADS - 1)/THREADS;
  // bugged_add_elements_kernel<<<BLOCKS, THREADS>>>(A_d, B_d, Cb_d, N);
  // cudaDeviceSynchronize();
  // // printf("cuda error %d\n", cudaGetLastError());
  // std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  // int THREADS = 128;
  // int BLOCKS = (N/4 + THREADS - 1)/THREADS;
  // // fake_ntt_kernel<<<BLOCKS, THREADS, sizeof(test_t)*THREADS>>>(A_d, C_d, N);
  // fake_ntt_kernel<<<BLOCKS, THREADS, sizeof(test_t)*THREADS*4>>>(A_d, C_d, N/4);
  // cudaDeviceSynchronize();
  // // printf("cuda error %d\n", cudaGetLastError());
  // std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  // THREADS = 128;
  // BLOCKS = (N/4 + THREADS - 1)/THREADS;
  // // fake_ntt_kernel<<<BLOCKS, THREADS, sizeof(test_t)*THREADS>>>(A_d, C_d, N);
  // bugged_fake_ntt_kernel<<<BLOCKS, THREADS, sizeof(test_t)*THREADS*4>>>(A_d, Cb_d, N/4);
  // // bugged_fake_ntt_kernel<<<1, 1, sizeof(test_t)*THREADS*4>>>(A_d, Cb_d, N/4);
  // cudaDeviceSynchronize();
  // // printf("cuda error %d\n", cudaGetLastError());
  // std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  cudaMemcpy(C_h, C_d, sizeof(test_t) * N, cudaMemcpyDeviceToHost);
  cudaMemcpy(Cb_h, Cb_d, sizeof(test_t) * N, cudaMemcpyDeviceToHost);

  // printf("A: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << A_h[i] << ", ";
  // }
  // printf("\n");
  // printf("C test: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << Cb_h[i] << ", ";
  // }
  // printf("\n");
  // printf("C ref: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << C_d[i] << ", ";
  //   // std::cout << C_h[i] << ", ";
  // }
  // printf("\n");

  return 0;
}
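In queryGPUProperties, the peak-bandwidth line follows the usual formula: memoryClockRate is reported in kHz, memoryBusWidth / 8 converts bits to bytes, the factor 2 accounts for double-data-rate transfers, and dividing by 1.0e6 scales the kHz-times-bytes product to GB/s.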
icicle/src/mini-course-examples/transpose_test.cu (new file, 123 lines)
@@ -0,0 +1,123 @@
#include "fields/id.h"
#define FIELD_ID 1001
// #define CURVE_ID 3
// #include "curves/curve_config.cuh"
#include "fields/field_config.cuh"

#include <chrono>
#include <iostream>
#include <vector>
#include <random>
#include <cub/device/device_radix_sort.cuh>

#include "fields/field.cuh"
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"

#include "kernels.cu"

class Dummy_Scalar
{
public:
  static constexpr unsigned NBITS = 32;

  unsigned x;
  unsigned p = 10;
  // unsigned p = 1<<30;

  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }

  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
  {
    os << scalar.x;
    return os;
  }

  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
  {
    return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
  }

  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    return {(p1.x + p2.x) % p1.p};
  }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
  static HOST_INLINE Dummy_Scalar rand_host()
  {
    return {(unsigned)rand() % 10};
    // return {(unsigned)rand()};
  }
};

typedef field_config::scalar_t test_scalar;
// typedef curve_config::scalar_t test_scalar;
// typedef curve_config::projective_t test_projective;
// typedef curve_config::affine_t test_affine;

// typedef int test_t;
// typedef int4 test_t;
// typedef Dummy_Scalar test_t;
// typedef test_projective test_t;
typedef test_scalar test_t;

int main()
{
  cudaEvent_t start, stop;
  float kernel_time;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  int N = 1<<11;
  int N2 = N*N;

  test_t* arr1_h = new test_t[N2];
  test_t* arr2_h = new test_t[N2];

  test_t *arr1_d, *arr2_d;

  cudaMalloc(&arr1_d, N2*sizeof(test_t));
  cudaMalloc(&arr2_d, N2*sizeof(test_t));

  for (int i = 0; i < N2; i++)
  {
    arr1_h[i] = i > 100? arr1_h[i-100] : test_t::rand_host();
  }

  cudaMemcpy(arr1_d, arr1_h, sizeof(test_t) * N2, cudaMemcpyHostToDevice);

  int THREADS = 256;
  int BLOCKS = (N2 + THREADS - 1)/THREADS;

  // warm up
  simple_memory_copy<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N2);
  shmem_transpose<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  cudaEventRecord(start, 0);

  simple_memory_copy<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N2);
  // naive_transpose_write<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // naive_transpose_read<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // shmem_transpose<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);

  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
  cudaEventRecord(stop, 0);
  cudaStreamSynchronize(0);
  cudaEventElapsedTime(&kernel_time, start, stop);
  printf("kernel_time : %.3f ms.\n", kernel_time);

  return 0;
}
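Note that shmem_transpose is only correct under the launch configuration used here: blockDim.x == 256 (one 16x16 tile per block) and the matrix side divisible by 16, both of which hold with THREADS = 256 and N = 1<<11. The simple_memory_copy launch gives the bandwidth ceiling against which the transpose variants can be compared.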