[FEAT]: Golang Bindings for pinned host memory (#519)

## Describe the changes

This PR adds the capability to pin host memory in the golang bindings, allowing faster data transfers between host and device. Memory can be pinned once for multiple devices by passing the flag
`cuda_runtime.CudaHostRegisterPortable` (when registering existing host memory) or
`cuda_runtime.CudaHostAllocPortable` (when allocating new pinned memory).
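
A minimal usage sketch of the new `HostSlice` API (the sample data, error handling, and program structure here are illustrative, not part of the PR):

```go
package main

import (
	"fmt"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)

func main() {
	// Illustrative data; any element type held in a HostSlice works the same way.
	data := core.HostSliceFromElements([]int{1, 2, 3, 4})

	// Option 1: register the slice's existing memory as pinned.
	// CudaHostRegisterPortable makes the pinned region visible to all CUDA contexts/devices.
	if err := data.Pin(cr.CudaHostRegisterPortable); err != cr.CudaSuccess {
		fmt.Println("Pin failed:", err)
		return
	}
	defer data.Unpin()

	// Option 2: allocate new pinned host memory and copy the slice into it.
	pinned, err := data.AllocPinned(cr.CudaHostAllocPortable)
	if err != cr.CudaSuccess {
		fmt.Println("AllocPinned failed:", err)
		return
	}
	defer pinned.FreePinned()
}
```

`Pin` registers the slice's existing backing memory and is undone with `Unpin`; `AllocPinned` returns a new pinned copy of the data that must be released with `FreePinned`.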
Author: Jeremy Felder
Date: 2024-06-24 14:03:44 +03:00 (committed via GitHub)
Parent: 7831f7bd0f
Commit: 2b07513310
25 changed files with 868 additions and 48 deletions

View File

@@ -197,6 +197,35 @@ func (h HostSlice[T]) AsUnsafePointer() unsafe.Pointer {
return unsafe.Pointer(&h[0])
}
// Registers host memory as pinned, allowing the GPU to read data from the host more quickly and saving GPU memory space.
// Memory pinned using this function should be unpinned using [Unpin]
func (h HostSlice[T]) Pin(flags cr.RegisterPinnedFlags) cr.CudaError {
_, err := cr.RegisterPinned(h.AsUnsafePointer(), h.SizeOfElement()*h.Len(), flags)
return err
}
// Unpins host memory that was pinned using [Pin]
func (h HostSlice[T]) Unpin() cr.CudaError {
return cr.FreeRegisteredPinned(h.AsUnsafePointer())
}
// Allocates new host memory as pinned and copies the HostSlice data to the newly allocated area
// Memory pinned using this function should be unpinned using [FreePinned]
func (h HostSlice[T]) AllocPinned(flags cr.AllocPinnedFlags) (HostSlice[T], cr.CudaError) {
pinnedMemPointer, err := cr.AllocPinned(h.SizeOfElement()*h.Len(), flags)
if err != cr.CudaSuccess {
return nil, err
}
pinnedMem := unsafe.Slice((*T)(pinnedMemPointer), h.Len())
copy(pinnedMem, h)
return pinnedMem, cr.CudaSuccess
}
// Unpins host memory that was pinned using [AllocPinned]
func (h HostSlice[T]) FreePinned() cr.CudaError {
return cr.FreeAllocPinned(h.AsUnsafePointer())
}
func (h HostSlice[T]) CopyToDevice(dst *DeviceSlice, shouldAllocate bool) *DeviceSlice {
size := h.Len() * h.SizeOfElement()
if shouldAllocate {

View File

@@ -6,6 +6,7 @@ import (
"unsafe"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core/internal"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/stretchr/testify/assert"
)
@@ -222,3 +223,26 @@ func TestSliceRanges(t *testing.T) {
hostSliceRange.CopyFromDevice(&deviceSliceRange)
assert.Equal(t, hostSlice[2:6], hostSliceRange)
}
func TestHostSlicePinning(t *testing.T) {
data := []int{1, 2, 3, 4, 5, 7, 8, 9}
dataHostSlice := HostSliceFromElements(data)
err := dataHostSlice.Pin(cuda_runtime.CudaHostRegisterDefault)
assert.Equal(t, cuda_runtime.CudaSuccess, err)
err = dataHostSlice.Pin(cuda_runtime.CudaHostRegisterDefault)
assert.Equal(t, cuda_runtime.CudaErrorHostMemoryAlreadyRegistered, err)
err = dataHostSlice.Unpin()
assert.Equal(t, cuda_runtime.CudaSuccess, err)
err = dataHostSlice.Unpin()
assert.Equal(t, cuda_runtime.CudaErrorHostMemoryNotRegistered, err)
pinnedMem, err := dataHostSlice.AllocPinned(cuda_runtime.CudaHostAllocDefault)
assert.Equal(t, cuda_runtime.CudaSuccess, err)
assert.ElementsMatch(t, dataHostSlice, pinnedMem)
err = pinnedMem.FreePinned()
assert.Equal(t, cuda_runtime.CudaSuccess, err)
err = pinnedMem.FreePinned()
assert.Equal(t, cuda_runtime.CudaErrorInvalidValue, err)
}

View File

@@ -177,3 +177,160 @@ const (
// CudaMemcpyDefault as declared in include/driver_types.h:1223
CudaMemcpyDefault CudaMemcpyKind = 4
)
type AllocPinnedFlags = C.uint
// cudaHostAlloc flags as declared in include/driver_types.h:85-88
const (
CudaHostAllocDefault AllocPinnedFlags = 0x00 /**< Default page-locked allocation flag */
CudaHostAllocPortable AllocPinnedFlags = 0x01 /**< Pinned memory accessible by all CUDA contexts */
// Currently not supported
// CudaHostAllocMapped AllocPinnedFlags = 0x02 /**< Map allocation into device space */
// CudaHostAllocWriteCombined AllocPinnedFlags = 0x04 /**< Write-combined memory */
)
type RegisterPinnedFlags = C.uint
// cudaHostRegister flags as declared in include/driver_types.h:90-94
const (
CudaHostRegisterDefault RegisterPinnedFlags = 0x00 /**< Default host memory registration flag */
CudaHostRegisterPortable RegisterPinnedFlags = 0x01 /**< Pinned memory accessible by all CUDA contexts */
// Currently not supported
// CudaHostRegisterMapped RegisterPinnedFlags = 0x02 /**< Map registered memory into device space */
// cudaHostRegisterIoMemory RegisterPinnedFlags = 0x04 /**< Memory-mapped I/O space */
// cudaHostRegisterReadOnly RegisterPinnedFlags = 0x08 /**< Memory-mapped read-only */
)
type DeviceAttribute = uint32
const (
CudaDevAttrMaxThreadsPerBlock DeviceAttribute = 1 /**< Maximum number of threads per block */
CudaDevAttrMaxBlockDimX DeviceAttribute = 2 /**< Maximum block dimension X */
CudaDevAttrMaxBlockDimY DeviceAttribute = 3 /**< Maximum block dimension Y */
CudaDevAttrMaxBlockDimZ DeviceAttribute = 4 /**< Maximum block dimension Z */
CudaDevAttrMaxGridDimX DeviceAttribute = 5 /**< Maximum grid dimension X */
CudaDevAttrMaxGridDimY DeviceAttribute = 6 /**< Maximum grid dimension Y */
CudaDevAttrMaxGridDimZ DeviceAttribute = 7 /**< Maximum grid dimension Z */
CudaDevAttrMaxSharedMemoryPerBlock DeviceAttribute = 8 /**< Maximum shared memory available per block in bytes */
CudaDevAttrTotalConstantMemory DeviceAttribute = 9 /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
CudaDevAttrWarpSize DeviceAttribute = 10 /**< Warp size in threads */
CudaDevAttrMaxPitch DeviceAttribute = 11 /**< Maximum pitch in bytes allowed by memory copies */
CudaDevAttrMaxRegistersPerBlock DeviceAttribute = 12 /**< Maximum number of 32-bit registers available per block */
CudaDevAttrClockRate DeviceAttribute = 13 /**< Peak clock frequency in kilohertz */
CudaDevAttrTextureAlignment DeviceAttribute = 14 /**< Alignment requirement for textures */
CudaDevAttrGpuOverlap DeviceAttribute = 15 /**< Device can possibly copy memory and execute a kernel concurrently */
CudaDevAttrMultiProcessorCount DeviceAttribute = 16 /**< Number of multiprocessors on device */
CudaDevAttrKernelExecTimeout DeviceAttribute = 17 /**< Specifies whether there is a run time limit on kernels */
CudaDevAttrIntegrated DeviceAttribute = 18 /**< Device is integrated with host memory */
CudaDevAttrCanMapHostMemory DeviceAttribute = 19 /**< Device can map host memory into CUDA address space */
CudaDevAttrComputeMode DeviceAttribute = 20 /**< Compute mode (See ::cudaComputeMode for details) */
CudaDevAttrMaxTexture1DWidth DeviceAttribute = 21 /**< Maximum 1D texture width */
CudaDevAttrMaxTexture2DWidth DeviceAttribute = 22 /**< Maximum 2D texture width */
CudaDevAttrMaxTexture2DHeight DeviceAttribute = 23 /**< Maximum 2D texture height */
CudaDevAttrMaxTexture3DWidth DeviceAttribute = 24 /**< Maximum 3D texture width */
CudaDevAttrMaxTexture3DHeight DeviceAttribute = 25 /**< Maximum 3D texture height */
CudaDevAttrMaxTexture3DDepth DeviceAttribute = 26 /**< Maximum 3D texture depth */
CudaDevAttrMaxTexture2DLayeredWidth DeviceAttribute = 27 /**< Maximum 2D layered texture width */
CudaDevAttrMaxTexture2DLayeredHeight DeviceAttribute = 28 /**< Maximum 2D layered texture height */
CudaDevAttrMaxTexture2DLayeredLayers DeviceAttribute = 29 /**< Maximum layers in a 2D layered texture */
CudaDevAttrSurfaceAlignment DeviceAttribute = 30 /**< Alignment requirement for surfaces */
CudaDevAttrConcurrentKernels DeviceAttribute = 31 /**< Device can possibly execute multiple kernels concurrently */
CudaDevAttrEccEnabled DeviceAttribute = 32 /**< Device has ECC support enabled */
CudaDevAttrPciBusId DeviceAttribute = 33 /**< PCI bus ID of the device */
CudaDevAttrPciDeviceId DeviceAttribute = 34 /**< PCI device ID of the device */
CudaDevAttrTccDriver DeviceAttribute = 35 /**< Device is using TCC driver model */
CudaDevAttrMemoryClockRate DeviceAttribute = 36 /**< Peak memory clock frequency in kilohertz */
CudaDevAttrGlobalMemoryBusWidth DeviceAttribute = 37 /**< Global memory bus width in bits */
CudaDevAttrL2CacheSize DeviceAttribute = 38 /**< Size of L2 cache in bytes */
CudaDevAttrMaxThreadsPerMultiProcessor DeviceAttribute = 39 /**< Maximum resident threads per multiprocessor */
CudaDevAttrAsyncEngineCount DeviceAttribute = 40 /**< Number of asynchronous engines */
CudaDevAttrUnifiedAddressing DeviceAttribute = 41 /**< Device shares a unified address space with the host */
CudaDevAttrMaxTexture1DLayeredWidth DeviceAttribute = 42 /**< Maximum 1D layered texture width */
CudaDevAttrMaxTexture1DLayeredLayers DeviceAttribute = 43 /**< Maximum layers in a 1D layered texture */
CudaDevAttrMaxTexture2DGatherWidth DeviceAttribute = 45 /**< Maximum 2D texture width if cudaArrayTextureGather is set */
CudaDevAttrMaxTexture2DGatherHeight DeviceAttribute = 46 /**< Maximum 2D texture height if cudaArrayTextureGather is set */
CudaDevAttrMaxTexture3DWidthAlt DeviceAttribute = 47 /**< Alternate maximum 3D texture width */
CudaDevAttrMaxTexture3DHeightAlt DeviceAttribute = 48 /**< Alternate maximum 3D texture height */
CudaDevAttrMaxTexture3DDepthAlt DeviceAttribute = 49 /**< Alternate maximum 3D texture depth */
CudaDevAttrPciDomainId DeviceAttribute = 50 /**< PCI domain ID of the device */
CudaDevAttrTexturePitchAlignment DeviceAttribute = 51 /**< Pitch alignment requirement for textures */
CudaDevAttrMaxTextureCubemapWidth DeviceAttribute = 52 /**< Maximum cubemap texture width/height */
CudaDevAttrMaxTextureCubemapLayeredWidth DeviceAttribute = 53 /**< Maximum cubemap layered texture width/height */
CudaDevAttrMaxTextureCubemapLayeredLayers DeviceAttribute = 54 /**< Maximum layers in a cubemap layered texture */
CudaDevAttrMaxSurface1DWidth DeviceAttribute = 55 /**< Maximum 1D surface width */
CudaDevAttrMaxSurface2DWidth DeviceAttribute = 56 /**< Maximum 2D surface width */
CudaDevAttrMaxSurface2DHeight DeviceAttribute = 57 /**< Maximum 2D surface height */
CudaDevAttrMaxSurface3DWidth DeviceAttribute = 58 /**< Maximum 3D surface width */
CudaDevAttrMaxSurface3DHeight DeviceAttribute = 59 /**< Maximum 3D surface height */
CudaDevAttrMaxSurface3DDepth DeviceAttribute = 60 /**< Maximum 3D surface depth */
CudaDevAttrMaxSurface1DLayeredWidth DeviceAttribute = 61 /**< Maximum 1D layered surface width */
CudaDevAttrMaxSurface1DLayeredLayers DeviceAttribute = 62 /**< Maximum layers in a 1D layered surface */
CudaDevAttrMaxSurface2DLayeredWidth DeviceAttribute = 63 /**< Maximum 2D layered surface width */
CudaDevAttrMaxSurface2DLayeredHeight DeviceAttribute = 64 /**< Maximum 2D layered surface height */
CudaDevAttrMaxSurface2DLayeredLayers DeviceAttribute = 65 /**< Maximum layers in a 2D layered surface */
CudaDevAttrMaxSurfaceCubemapWidth DeviceAttribute = 66 /**< Maximum cubemap surface width */
CudaDevAttrMaxSurfaceCubemapLayeredWidth DeviceAttribute = 67 /**< Maximum cubemap layered surface width */
CudaDevAttrMaxSurfaceCubemapLayeredLayers DeviceAttribute = 68 /**< Maximum layers in a cubemap layered surface */
CudaDevAttrMaxTexture1DLinearWidth DeviceAttribute = 69 /**< Maximum 1D linear texture width */
CudaDevAttrMaxTexture2DLinearWidth DeviceAttribute = 70 /**< Maximum 2D linear texture width */
CudaDevAttrMaxTexture2DLinearHeight DeviceAttribute = 71 /**< Maximum 2D linear texture height */
CudaDevAttrMaxTexture2DLinearPitch DeviceAttribute = 72 /**< Maximum 2D linear texture pitch in bytes */
CudaDevAttrMaxTexture2DMipmappedWidth DeviceAttribute = 73 /**< Maximum mipmapped 2D texture width */
CudaDevAttrMaxTexture2DMipmappedHeight DeviceAttribute = 74 /**< Maximum mipmapped 2D texture height */
CudaDevAttrComputeCapabilityMajor DeviceAttribute = 75 /**< Major compute capability version number */
CudaDevAttrComputeCapabilityMinor DeviceAttribute = 76 /**< Minor compute capability version number */
CudaDevAttrMaxTexture1DMipmappedWidth DeviceAttribute = 77 /**< Maximum mipmapped 1D texture width */
CudaDevAttrStreamPrioritiesSupported DeviceAttribute = 78 /**< Device supports stream priorities */
CudaDevAttrGlobalL1CacheSupported DeviceAttribute = 79 /**< Device supports caching globals in L1 */
CudaDevAttrLocalL1CacheSupported DeviceAttribute = 80 /**< Device supports caching locals in L1 */
CudaDevAttrMaxSharedMemoryPerMultiprocessor DeviceAttribute = 81 /**< Maximum shared memory available per multiprocessor in bytes */
CudaDevAttrMaxRegistersPerMultiprocessor DeviceAttribute = 82 /**< Maximum number of 32-bit registers available per multiprocessor */
CudaDevAttrManagedMemory DeviceAttribute = 83 /**< Device can allocate managed memory on this system */
CudaDevAttrIsMultiGpuBoard DeviceAttribute = 84 /**< Device is on a multi-GPU board */
CudaDevAttrMultiGpuBoardGroupID DeviceAttribute = 85 /**< Unique identifier for a group of devices on the same multi-GPU board */
CudaDevAttrHostNativeAtomicSupported DeviceAttribute = 86 /**< Link between the device and the host supports native atomic operations */
CudaDevAttrSingleToDoublePrecisionPerfRatio DeviceAttribute = 87 /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
CudaDevAttrPageableMemoryAccess DeviceAttribute = 88 /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
CudaDevAttrConcurrentManagedAccess DeviceAttribute = 89 /**< Device can coherently access managed memory concurrently with the CPU */
CudaDevAttrComputePreemptionSupported DeviceAttribute = 90 /**< Device supports Compute Preemption */
CudaDevAttrCanUseHostPointerForRegisteredMem DeviceAttribute = 91 /**< Device can access host registered memory at the same virtual address as the CPU */
CudaDevAttrReserved92 DeviceAttribute = 92
CudaDevAttrReserved93 DeviceAttribute = 93
CudaDevAttrReserved94 DeviceAttribute = 94
CudaDevAttrCooperativeLaunch DeviceAttribute = 95 /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel*/
CudaDevAttrCooperativeMultiDeviceLaunch DeviceAttribute = 96 /**< Deprecated cudaLaunchCooperativeKernelMultiDevice is deprecated. */
CudaDevAttrMaxSharedMemoryPerBlockOptin DeviceAttribute = 97 /**< The maximum opt-in shared memory per block. This value may vary by chip. See ::cudaFuncSetAttribute */
CudaDevAttrCanFlushRemoteWrites DeviceAttribute = 98 /**< Device supports flushing of outstanding remote writes. */
CudaDevAttrHostRegisterSupported DeviceAttribute = 99 /**< Device supports host memory registration via ::cudaHostRegister. */
CudaDevAttrPageableMemoryAccessUsesHostPageTables DeviceAttribute = 100 /**< Device accesses pageable memory via the host's page tables. */
CudaDevAttrDirectManagedMemAccessFromHost DeviceAttribute = 101 /**< Host can directly access managed memory on the device without migration. */
CudaDevAttrMaxBlocksPerMultiprocessor DeviceAttribute = 106 /**< Maximum number of blocks per multiprocessor */
CudaDevAttrMaxPersistingL2CacheSize DeviceAttribute = 108 /**< Maximum L2 persisting lines capacity setting in bytes. */
CudaDevAttrMaxAccessPolicyWindowSize DeviceAttribute = 109 /**< Maximum value of cudaAccessPolicyWindow::num_bytes. */
CudaDevAttrReservedSharedMemoryPerBlock DeviceAttribute = 111 /**< Shared memory reserved by CUDA driver per block in bytes */
CudaDevAttrSparseCudaArraySupported DeviceAttribute = 112 /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
CudaDevAttrHostRegisterReadOnlySupported DeviceAttribute = 113 /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */
CudaDevAttrTimelineSemaphoreInteropSupported DeviceAttribute = 114 /**< External timeline semaphore interop is supported on the device */
CudaDevAttrMaxTimelineSemaphoreInteropSupported DeviceAttribute = 114 /**< Deprecated External timeline semaphore interop is supported on the device */
CudaDevAttrMemoryPoolsSupported DeviceAttribute = 115 /**< Device supports using the ::cudaMallocAsync and ::cudaMemPool family of APIs */
CudaDevAttrGPUDirectRDMASupported DeviceAttribute = 116 /**< Device supports GPUDirect RDMA APIs like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
CudaDevAttrGPUDirectRDMAFlushWritesOptions DeviceAttribute = 117 /**< The returned attribute shall be interpreted as a bitmask where the individual bits are listed in the ::cudaFlushGPUDirectRDMAWritesOptions enum */
CudaDevAttrGPUDirectRDMAWritesOrdering DeviceAttribute = 118 /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::cudaGPUDirectRDMAWritesOrdering for the numerical values returned here. */
CudaDevAttrMemoryPoolSupportedHandleTypes DeviceAttribute = 119 /**< Handle types supported with mempool based IPC */
CudaDevAttrClusterLaunch DeviceAttribute = 120 /**< Indicates device supports cluster launch */
CudaDevAttrDeferredMappingCudaArraySupported DeviceAttribute = 121 /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
CudaDevAttrReserved122 DeviceAttribute = 122
CudaDevAttrReserved123 DeviceAttribute = 123
CudaDevAttrReserved124 DeviceAttribute = 124
CudaDevAttrIpcEventSupport DeviceAttribute = 125 /**< Device supports IPC Events. */
CudaDevAttrMemSyncDomainCount DeviceAttribute = 126 /**< Number of memory synchronization domains the device supports. */
CudaDevAttrReserved127 DeviceAttribute = 127
CudaDevAttrReserved128 DeviceAttribute = 128
CudaDevAttrReserved129 DeviceAttribute = 129
CudaDevAttrNumaConfig DeviceAttribute = 130 /**< NUMA configuration of a device: value is of type cudaDeviceNumaConfig enum */
CudaDevAttrNumaId DeviceAttribute = 131 /**< NUMA node ID of the GPU memory */
CudaDevAttrReserved132 DeviceAttribute = 132
CudaDevAttrMpsEnabled DeviceAttribute = 133 /**< Contexts created on this device will be shared via MPS */
CudaDevAttrHostNumaId DeviceAttribute = 134 /**< NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA. */
CudaDevAttrMax DeviceAttribute = 135
)

View File

@@ -74,6 +74,14 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
return int(cCudaPointerAttributes.device)
}
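// GetDeviceAttribute returns the value of the given device attribute for the specified device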
func GetDeviceAttribute(attr DeviceAttribute, device int) int {
var res int
cRes := (*C.int)(unsafe.Pointer(&res))
cDevice := (C.int)(device)
C.cudaDeviceGetAttribute(cRes, attr, cDevice)
return res
}
// RunOnDevice forces the provided function to run all GPU related calls within it
// on the same host thread and therefore the same GPU device.
//
@@ -84,46 +92,46 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
//
// As an example:
//
// cr.RunOnDevice(i, func(args ...any) {
// defer wg.Done()
// cfg := GetDefaultMSMConfig()
// stream, _ := cr.CreateStream()
// for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
// size := 1 << power
// // This will always print "Inner goroutine device: 0"
// // go func () {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // }()
// // To force the above goroutine to same device as the wrapping function:
// // RunOnDevice(i, func(arg ...any) {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // })
// scalars := GenerateScalars(size)
// points := GenerateAffinePoints(size)
// var p Projective
// var out core.DeviceSlice
// _, e := out.MallocAsync(p.Size(), p.Size(), stream)
// assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
// cfg.Ctx.Stream = &stream
// cfg.IsAsync = true
// e = Msm(scalars, points, &cfg, out)
// assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
// outHost := make(core.HostSlice[Projective], 1)
// cr.SynchronizeStream(&stream)
// outHost.CopyFromDevice(&out)
// out.Free()
// // Check with gnark-crypto
// assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
// }
// }, i)
func RunOnDevice(deviceId int, funcToRun func(args ...any), args ...any) {
go func(id int) {
defer runtime.UnlockOSThread()

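For context, the pinned-memory MSM tests added below gate pinning on two device attributes queried through the new `GetDeviceAttribute` binding. A hedged sketch of that check, factored into a helper (the package and helper name are illustrative):

```go
package pinning

import (
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)

// canPinHostMemory reports whether pinning host memory is expected to help on the
// given device: the device must support cudaHostRegister and must not already
// access pageable memory through the host's page tables.
func canPinHostMemory(device int) bool {
	pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, device)
	lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, device)
	return pinnable == 1 && lockable == 0
}
```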
View File

@@ -9,6 +9,7 @@ package cuda_runtime
import "C"
import (
// "runtime"
"unsafe"
)
@@ -58,6 +59,45 @@ func FreeAsync(devicePtr unsafe.Pointer, stream Stream) CudaError {
return err
}
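// AllocPinned allocates size bytes of page-locked (pinned) host memory via cudaHostAlloc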
func AllocPinned(size int, flags AllocPinnedFlags) (unsafe.Pointer, CudaError) {
cSize := (C.size_t)(size)
var hostPtr unsafe.Pointer
ret := C.cudaHostAlloc(&hostPtr, cSize, flags)
err := (CudaError)(ret)
if err != CudaSuccess {
return nil, err
}
return hostPtr, CudaSuccess
}
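// GetHostFlags returns the flags that were used to allocate or register the pinned host memory at ptr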
func GetHostFlags(ptr unsafe.Pointer) (flag uint) {
var cFlag C.uint
C.cudaHostGetFlags(&cFlag, ptr)
flag = uint(cFlag)
return
}
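// FreeAllocPinned frees pinned host memory that was allocated with [AllocPinned]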
func FreeAllocPinned(hostPtr unsafe.Pointer) CudaError {
return (CudaError)(C.cudaFreeHost(hostPtr))
}
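// RegisterPinned page-locks (pins) existing host memory so the GPU can access it directly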
func RegisterPinned(hostPtr unsafe.Pointer, size int, flags RegisterPinnedFlags) (unsafe.Pointer, CudaError) {
cSize := (C.size_t)(size)
// Mask to the supported flag bits, since higher RegisterPinnedFlags values are not currently supported
flags = flags & 3
ret := C.cudaHostRegister(hostPtr, cSize, flags)
err := (CudaError)(ret)
if err != CudaSuccess {
return nil, err
}
return hostPtr, CudaSuccess
}
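// FreeRegisteredPinned unregisters host memory that was pinned with [RegisterPinned]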
func FreeRegisteredPinned(hostPtr unsafe.Pointer) CudaError {
return (CudaError)(C.cudaHostUnregister(hostPtr))
}
func CopyFromDevice(hostDst, deviceSrc unsafe.Pointer, size uint) (unsafe.Pointer, CudaError) {
cCount := (C.size_t)(size)
ret := C.cudaMemcpy(hostDst, deviceSrc, cCount, uint32(CudaMemcpyDeviceToHost))

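To show how the new low-level `cuda_runtime` functions above fit together, here is a minimal sketch mirroring the register/alloc round trips in the tests that follow (the package name, helper name, and sample data are illustrative):

```go
package pinning

import (
	"unsafe"

	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)

// pinnedRoundTrip exercises both pinning paths added in memory.go.
func pinnedRoundTrip() cr.CudaError {
	data := []int{1, 2, 3, 4}
	sizeBytes := int(unsafe.Sizeof(data[0])) * len(data)

	// Path 1: page-lock memory that already exists on the Go side.
	if _, err := cr.RegisterPinned(unsafe.Pointer(&data[0]), sizeBytes, cr.CudaHostRegisterDefault); err != cr.CudaSuccess {
		return err
	}
	defer cr.FreeRegisteredPinned(unsafe.Pointer(&data[0]))

	// Path 2: allocate fresh pinned memory directly from the CUDA runtime.
	ptr, err := cr.AllocPinned(sizeBytes, cr.CudaHostAllocDefault)
	if err != cr.CudaSuccess {
		return err
	}
	return cr.FreeAllocPinned(ptr)
}
```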
View File

@@ -40,3 +40,27 @@ func TestCopyFromToHost(t *testing.T) {
assert.Equal(t, CudaSuccess, err, "Couldn't copy to device due to %v", err)
assert.Equal(t, someInts, someInts2, "Elements of host slices do not match. Copying from/to host failed")
}
func TestRegisterUnregisterPinned(t *testing.T) {
data := []int{1, 2, 3, 4, 5, 7, 8, 9}
dataUnsafe := unsafe.Pointer(&data[0])
_, err := RegisterPinned(dataUnsafe, int(unsafe.Sizeof(data[0]))*len(data), CudaHostRegisterDefault)
assert.Equal(t, CudaSuccess, err)
_, err = RegisterPinned(dataUnsafe, int(unsafe.Sizeof(data[0]))*len(data), CudaHostRegisterDefault)
assert.Equal(t, CudaErrorHostMemoryAlreadyRegistered, err)
err = FreeRegisteredPinned(dataUnsafe)
assert.Equal(t, CudaSuccess, err)
err = FreeRegisteredPinned(dataUnsafe)
assert.Equal(t, CudaErrorHostMemoryNotRegistered, err)
}
func TestAllocFreePinned(t *testing.T) {
pinnedMemPointer, err := AllocPinned(int(unsafe.Sizeof(1)*9), CudaHostAllocDefault)
assert.Equal(t, CudaSuccess, err)
err = FreeAllocPinned(pinnedMemPointer)
assert.Equal(t, CudaSuccess, err)
err = FreeAllocPinned(pinnedMemPointer)
assert.Equal(t, CudaErrorInvalidValue, err)
}

View File

@@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {
}
}
func TestMSMG2PinnedHostMemory(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicleBls12_377.GenerateScalars(size)
points := g2.G2GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[g2.G2Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p g2.G2Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[g2.G2Projective], 1)
e = g2.G2Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))
if pinnableAndLockable {
e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{3} {

View File

@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {
}
}
func TestMSMPinnedHostMemory(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicleBls12_377.GenerateScalars(size)
points := icicleBls12_377.GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[icicleBls12_377.Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p icicleBls12_377.Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[icicleBls12_377.Projective], 1)
e = msm.Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
if pinnableAndLockable {
e = msm.Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{3} {

View File

@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {
func TestNttBatch(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := bls12_377.GenerateScalars(1 << largestTestSize * largestBatchSize)
for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

View File

@@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {
}
}
func TestMSMG2PinnedHostMemory(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicleBls12_381.GenerateScalars(size)
points := g2.G2GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[g2.G2Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p g2.G2Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[g2.G2Projective], 1)
e = g2.G2Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))
if pinnableAndLockable {
e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{3} {

View File

@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {
}
}
func TestMSMPinnedHostMemory(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicleBls12_381.GenerateScalars(size)
points := icicleBls12_381.GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[icicleBls12_381.Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p icicleBls12_381.Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[icicleBls12_381.Projective], 1)
e = msm.Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
if pinnableAndLockable {
e = msm.Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{3} {

View File

@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {
func TestNttBatch(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := bls12_381.GenerateScalars(1 << largestTestSize * largestBatchSize)
for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

View File

@@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {
}
}
func TestMSMG2PinnedHostMemory(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicleBn254.GenerateScalars(size)
points := g2.G2GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[g2.G2Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p g2.G2Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[g2.G2Projective], 1)
e = g2.G2Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))
if pinnableAndLockable {
e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{3} {

View File

@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {
}
}
func TestMSMPinnedHostMemory(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicleBn254.GenerateScalars(size)
points := icicleBn254.GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[icicleBn254.Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p icicleBn254.Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[icicleBn254.Projective], 1)
e = msm.Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
if pinnableAndLockable {
e = msm.Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{3} {

View File

@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {
func TestNttBatch(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := bn254.GenerateScalars(1 << largestTestSize * largestBatchSize)
for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

View File

@@ -106,6 +106,58 @@ func TestMSMG2(t *testing.T) {
}
}
func TestMSMG2PinnedHostMemory(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicleBw6_761.GenerateScalars(size)
points := g2.G2GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[g2.G2Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p g2.G2Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[g2.G2Projective], 1)
e = g2.G2Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))
if pinnableAndLockable {
e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{3} {

View File

@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {
}
}
func TestMSMPinnedHostMemory(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicleBw6_761.GenerateScalars(size)
points := icicleBw6_761.GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[icicleBw6_761.Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p icicleBw6_761.Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[icicleBw6_761.Projective], 1)
e = msm.Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
if pinnableAndLockable {
e = msm.Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{3} {

View File

@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {
func TestNttBatch(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := bw6_761.GenerateScalars(1 << largestTestSize * largestBatchSize)
for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

View File

@@ -40,6 +40,54 @@ func TestMSM(t *testing.T) {
}
}
func TestMSMPinnedHostMemory(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicleGrumpkin.GenerateScalars(size)
points := icicleGrumpkin.GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[icicleGrumpkin.Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p icicleGrumpkin.Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[icicleGrumpkin.Projective], 1)
e = msm.Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
if pinnableAndLockable {
e = msm.Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
func TestMSMBatch(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{10, 16} {

View File

@@ -61,11 +61,12 @@ func TestNttDeviceAsyncNoDomain(t *testing.T) {
func TestNttBatchNoDomain(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := babybear_extension.GenerateScalars(1 << largestTestSize * largestBatchSize)
for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

View File

@@ -80,11 +80,12 @@ func TestNttDeviceAsync(t *testing.T) {
func TestNttBatch(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := babybear.GenerateScalars(1 << largestTestSize * largestBatchSize)
for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

View File

@@ -185,6 +185,61 @@ func TestMSM{{.CurvePrefix}}(t *testing.T) {
{{end}}
}
}
func TestMSM{{.CurvePrefix}}PinnedHostMemory(t *testing.T) {
cfg := {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}GetDefaultMSMConfig()
for _, power := range []int{10} {
size := 1 << power
scalars := icicle{{capitalize .Curve}}.GenerateScalars(size)
points := {{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}GenerateAffinePoints(size)
pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)
pinnableAndLockable := pinnable == 1 && lockable == 0
var pinnedPoints core.HostSlice[{{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Affine]
if pinnableAndLockable {
points.Pin(cr.CudaHostRegisterDefault)
pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
}
var p {{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective
var out core.DeviceSlice
_, e := out.Malloc(p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
outHost := make(core.HostSlice[{{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective], 1)
e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}Msm(scalars, points, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")
outHost.CopyFromDevice(&out)
{{if ne .GnarkImport "" -}}
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm{{.CurvePrefix}}(scalars, points, outHost[0]))
{{end}}
if pinnableAndLockable {
e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}Msm(scalars, pinnedPoints, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")
outHost.CopyFromDevice(&out)
{{if ne .GnarkImport "" -}}
// Check with gnark-crypto
assert.True(t, testAgainstGnarkCryptoMsm{{.CurvePrefix}}(scalars, pinnedPoints, outHost[0]))
{{end}}
}
out.Free()
if pinnableAndLockable {
points.Unpin()
pinnedPoints.FreePinned()
}
}
}
{{if ne .GnarkImport "" -}}
func TestMSM{{if eq .CurvePrefix "G2"}}G2{{end}}GnarkCryptoTypes(t *testing.T) {
cfg := {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}GetDefaultMSMConfig()

View File

@@ -64,11 +64,12 @@ func TestNttDeviceAsyncNoDomain(t *testing.T) {
func TestNttBatchNoDomain(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := {{.FieldNoDomain}}.GenerateScalars(1 << largestTestSize * largestBatchSize)
for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

View File

@@ -163,11 +163,12 @@ func TestNttDeviceAsync(t *testing.T) {
func TestNttBatch(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := {{.Field}}.GenerateScalars(1 << largestTestSize * largestBatchSize)
for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

View File

@@ -53,6 +53,8 @@ fn main() {
.allowlist_function("cudaGetDevice")
.allowlist_function("cudaSetDevice")
.allowlist_function("cudaGetDeviceCount")
.allowlist_function("cudaDeviceGetAttribute")
.rustified_enum("cudaDeviceAttr")
// error handling
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
.allowlist_function("cudaGetLastError")
@@ -79,6 +81,15 @@ fn main() {
.allowlist_function("cudaDeviceGetDefaultMemPool")
.allowlist_function("cudaMemGetInfo")
.rustified_enum("cudaMemcpyKind")
.allowlist_function("cudaHostAlloc")
.allowlist_var("cudaHostAllocDefault")
.allowlist_var("cudaHostAllocPortable")
.allowlist_function("cudaFreeHost")
.allowlist_function("cudaHostGetFlags")
.allowlist_function("cudaHostRegister")
.allowlist_var("cudaHostRegisterDefault")
.allowlist_var("cudaHostRegisterPortable")
.allowlist_function("cudaHostUnregister")
// Stream Ordered Memory Allocator
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
.allowlist_function("cudaFreeAsync")