[FEAT]: Golang Bindings for pinned host memory (#519)
## Describe the changes

This PR adds the ability to pin host memory in the Golang bindings, allowing faster host-to-device data transfers. Memory can be pinned once for multiple devices by passing `cuda_runtime.CudaHostRegisterPortable` (when registering existing memory) or `cuda_runtime.CudaHostAllocPortable` (when allocating new pinned memory).
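As a quick illustration of that flag choice (a hedged sketch, not part of the diff below; `points` stands for any existing core.HostSlice and error handling is omitted):

// Register the slice's existing backing memory as pinned once for all devices:
points.Pin(cuda_runtime.CudaHostRegisterPortable)
// ... transfers ...
points.Unpin()

// Or allocate a fresh pinned copy of the data, likewise visible to all CUDA contexts:
pinnedPoints, err := points.AllocPinned(cuda_runtime.CudaHostAllocPortable)
if err == cuda_runtime.CudaSuccess {
  defer pinnedPoints.FreePinned()
}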
@@ -197,6 +197,35 @@ func (h HostSlice[T]) AsUnsafePointer() unsafe.Pointer {
  return unsafe.Pointer(&h[0])
}

// Registers host memory as pinned, allowing the GPU to read data from the host quicker and save GPU memory space.
// Memory pinned using this function should be unpinned using [Unpin]
func (h HostSlice[T]) Pin(flags cr.RegisterPinnedFlags) cr.CudaError {
  _, err := cr.RegisterPinned(h.AsUnsafePointer(), h.SizeOfElement()*h.Len(), flags)
  return err
}

// Unregisters host memory as pinned
func (h HostSlice[T]) Unpin() cr.CudaError {
  return cr.FreeRegisteredPinned(h.AsUnsafePointer())
}

// Allocates new host memory as pinned and copies the HostSlice data to the newly allocated area
// Memory pinned using this function should be unpinned using [FreePinned]
func (h HostSlice[T]) AllocPinned(flags cr.AllocPinnedFlags) (HostSlice[T], cr.CudaError) {
  pinnedMemPointer, err := cr.AllocPinned(h.SizeOfElement()*h.Len(), flags)
  if err != cr.CudaSuccess {
    return nil, err
  }
  pinnedMem := unsafe.Slice((*T)(pinnedMemPointer), h.Len())
  copy(pinnedMem, h)
  return pinnedMem, cr.CudaSuccess
}

// Unpins host memory that was pinned using [AllocPinned]
func (h HostSlice[T]) FreePinned() cr.CudaError {
  return cr.FreeAllocPinned(h.AsUnsafePointer())
}

func (h HostSlice[T]) CopyToDevice(dst *DeviceSlice, shouldAllocate bool) *DeviceSlice {
  size := h.Len() * h.SizeOfElement()
  if shouldAllocate {
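A minimal end-to-end sketch of how these methods combine with CopyToDevice (illustrative only; `somePoints`, the element type, and the error handling are placeholders, not part of this commit):

points := core.HostSliceFromElements(somePoints)

// Variant 1: register the existing backing memory, copy, then unpin.
if err := points.Pin(cuda_runtime.CudaHostRegisterDefault); err != cuda_runtime.CudaSuccess {
  // handle error
}
var devicePoints core.DeviceSlice
points.CopyToDevice(&devicePoints, true) // the copy benefits from the page-locked source
points.Unpin()

// Variant 2: allocate a fresh pinned copy and free it when done.
pinnedPoints, err := points.AllocPinned(cuda_runtime.CudaHostAllocDefault)
if err == cuda_runtime.CudaSuccess {
  pinnedPoints.CopyToDevice(&devicePoints, false)
  pinnedPoints.FreePinned()
}
devicePoints.Free()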
@@ -6,6 +6,7 @@ import (
  "unsafe"

  "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core/internal"
  "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
  "github.com/stretchr/testify/assert"
)

@@ -222,3 +223,26 @@ func TestSliceRanges(t *testing.T) {
  hostSliceRange.CopyFromDevice(&deviceSliceRange)
  assert.Equal(t, hostSlice[2:6], hostSliceRange)
}

func TestHostSlicePinning(t *testing.T) {
  data := []int{1, 2, 3, 4, 5, 7, 8, 9}
  dataHostSlice := HostSliceFromElements(data)
  err := dataHostSlice.Pin(cuda_runtime.CudaHostRegisterDefault)
  assert.Equal(t, cuda_runtime.CudaSuccess, err)
  err = dataHostSlice.Pin(cuda_runtime.CudaHostRegisterDefault)
  assert.Equal(t, cuda_runtime.CudaErrorHostMemoryAlreadyRegistered, err)

  err = dataHostSlice.Unpin()
  assert.Equal(t, cuda_runtime.CudaSuccess, err)
  err = dataHostSlice.Unpin()
  assert.Equal(t, cuda_runtime.CudaErrorHostMemoryNotRegistered, err)

  pinnedMem, err := dataHostSlice.AllocPinned(cuda_runtime.CudaHostAllocDefault)
  assert.Equal(t, cuda_runtime.CudaSuccess, err)
  assert.ElementsMatch(t, dataHostSlice, pinnedMem)

  err = pinnedMem.FreePinned()
  assert.Equal(t, cuda_runtime.CudaSuccess, err)
  err = pinnedMem.FreePinned()
  assert.Equal(t, cuda_runtime.CudaErrorInvalidValue, err)
}
@@ -177,3 +177,160 @@ const (
  // CudaMemcpyDefault as declared in include/driver_types.h:1223
  CudaMemcpyDefault CudaMemcpyKind = 4
)

type AllocPinnedFlags = C.uint

// cudaHostAlloc flags as declared in include/driver_types.h:85-88
const (
  CudaHostAllocDefault AllocPinnedFlags = 0x00 /**< Default page-locked allocation flag */
  CudaHostAllocPortable AllocPinnedFlags = 0x01 /**< Pinned memory accessible by all CUDA contexts */
  // Currently not supported
  // CudaHostAllocMapped AllocPinnedFlags = 0x02 /**< Map allocation into device space */
  // CudaHostAllocWriteCombined AllocPinnedFlags = 0x04 /**< Write-combined memory */
)

type RegisterPinnedFlags = C.uint

// cudaHostRegister flags as declared in include/driver_types.h:90-94
const (
  CudaHostRegisterDefault RegisterPinnedFlags = 0x00 /**< Default host memory registration flag */
  CudaHostRegisterPortable RegisterPinnedFlags = 0x01 /**< Pinned memory accessible by all CUDA contexts */
  // Currently not supported
  // CudaHostRegisterMapped RegisterPinnedFlags = 0x02 /**< Map registered memory into device space */
  // cudaHostRegisterIoMemory RegisterPinnedFlags = 0x04 /**< Memory-mapped I/O space */
  // cudaHostRegisterReadOnly RegisterPinnedFlags = 0x08 /**< Memory-mapped read-only */
)

type DeviceAttribute = uint32

const (
  CudaDevAttrMaxThreadsPerBlock DeviceAttribute = 1 /**< Maximum number of threads per block */
  CudaDevAttrMaxBlockDimX DeviceAttribute = 2 /**< Maximum block dimension X */
  CudaDevAttrMaxBlockDimY DeviceAttribute = 3 /**< Maximum block dimension Y */
  CudaDevAttrMaxBlockDimZ DeviceAttribute = 4 /**< Maximum block dimension Z */
  CudaDevAttrMaxGridDimX DeviceAttribute = 5 /**< Maximum grid dimension X */
  CudaDevAttrMaxGridDimY DeviceAttribute = 6 /**< Maximum grid dimension Y */
  CudaDevAttrMaxGridDimZ DeviceAttribute = 7 /**< Maximum grid dimension Z */
  CudaDevAttrMaxSharedMemoryPerBlock DeviceAttribute = 8 /**< Maximum shared memory available per block in bytes */
  CudaDevAttrTotalConstantMemory DeviceAttribute = 9 /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
  CudaDevAttrWarpSize DeviceAttribute = 10 /**< Warp size in threads */
  CudaDevAttrMaxPitch DeviceAttribute = 11 /**< Maximum pitch in bytes allowed by memory copies */
  CudaDevAttrMaxRegistersPerBlock DeviceAttribute = 12 /**< Maximum number of 32-bit registers available per block */
  CudaDevAttrClockRate DeviceAttribute = 13 /**< Peak clock frequency in kilohertz */
  CudaDevAttrTextureAlignment DeviceAttribute = 14 /**< Alignment requirement for textures */
  CudaDevAttrGpuOverlap DeviceAttribute = 15 /**< Device can possibly copy memory and execute a kernel concurrently */
  CudaDevAttrMultiProcessorCount DeviceAttribute = 16 /**< Number of multiprocessors on device */
  CudaDevAttrKernelExecTimeout DeviceAttribute = 17 /**< Specifies whether there is a run time limit on kernels */
  CudaDevAttrIntegrated DeviceAttribute = 18 /**< Device is integrated with host memory */
  CudaDevAttrCanMapHostMemory DeviceAttribute = 19 /**< Device can map host memory into CUDA address space */
  CudaDevAttrComputeMode DeviceAttribute = 20 /**< Compute mode (See ::cudaComputeMode for details) */
  CudaDevAttrMaxTexture1DWidth DeviceAttribute = 21 /**< Maximum 1D texture width */
  CudaDevAttrMaxTexture2DWidth DeviceAttribute = 22 /**< Maximum 2D texture width */
  CudaDevAttrMaxTexture2DHeight DeviceAttribute = 23 /**< Maximum 2D texture height */
  CudaDevAttrMaxTexture3DWidth DeviceAttribute = 24 /**< Maximum 3D texture width */
  CudaDevAttrMaxTexture3DHeight DeviceAttribute = 25 /**< Maximum 3D texture height */
  CudaDevAttrMaxTexture3DDepth DeviceAttribute = 26 /**< Maximum 3D texture depth */
  CudaDevAttrMaxTexture2DLayeredWidth DeviceAttribute = 27 /**< Maximum 2D layered texture width */
  CudaDevAttrMaxTexture2DLayeredHeight DeviceAttribute = 28 /**< Maximum 2D layered texture height */
  CudaDevAttrMaxTexture2DLayeredLayers DeviceAttribute = 29 /**< Maximum layers in a 2D layered texture */
  CudaDevAttrSurfaceAlignment DeviceAttribute = 30 /**< Alignment requirement for surfaces */
  CudaDevAttrConcurrentKernels DeviceAttribute = 31 /**< Device can possibly execute multiple kernels concurrently */
  CudaDevAttrEccEnabled DeviceAttribute = 32 /**< Device has ECC support enabled */
  CudaDevAttrPciBusId DeviceAttribute = 33 /**< PCI bus ID of the device */
  CudaDevAttrPciDeviceId DeviceAttribute = 34 /**< PCI device ID of the device */
  CudaDevAttrTccDriver DeviceAttribute = 35 /**< Device is using TCC driver model */
  CudaDevAttrMemoryClockRate DeviceAttribute = 36 /**< Peak memory clock frequency in kilohertz */
  CudaDevAttrGlobalMemoryBusWidth DeviceAttribute = 37 /**< Global memory bus width in bits */
  CudaDevAttrL2CacheSize DeviceAttribute = 38 /**< Size of L2 cache in bytes */
  CudaDevAttrMaxThreadsPerMultiProcessor DeviceAttribute = 39 /**< Maximum resident threads per multiprocessor */
  CudaDevAttrAsyncEngineCount DeviceAttribute = 40 /**< Number of asynchronous engines */
  CudaDevAttrUnifiedAddressing DeviceAttribute = 41 /**< Device shares a unified address space with the host */
  CudaDevAttrMaxTexture1DLayeredWidth DeviceAttribute = 42 /**< Maximum 1D layered texture width */
  CudaDevAttrMaxTexture1DLayeredLayers DeviceAttribute = 43 /**< Maximum layers in a 1D layered texture */
  CudaDevAttrMaxTexture2DGatherWidth DeviceAttribute = 45 /**< Maximum 2D texture width if cudaArrayTextureGather is set */
  CudaDevAttrMaxTexture2DGatherHeight DeviceAttribute = 46 /**< Maximum 2D texture height if cudaArrayTextureGather is set */
  CudaDevAttrMaxTexture3DWidthAlt DeviceAttribute = 47 /**< Alternate maximum 3D texture width */
  CudaDevAttrMaxTexture3DHeightAlt DeviceAttribute = 48 /**< Alternate maximum 3D texture height */
  CudaDevAttrMaxTexture3DDepthAlt DeviceAttribute = 49 /**< Alternate maximum 3D texture depth */
  CudaDevAttrPciDomainId DeviceAttribute = 50 /**< PCI domain ID of the device */
  CudaDevAttrTexturePitchAlignment DeviceAttribute = 51 /**< Pitch alignment requirement for textures */
  CudaDevAttrMaxTextureCubemapWidth DeviceAttribute = 52 /**< Maximum cubemap texture width/height */
  CudaDevAttrMaxTextureCubemapLayeredWidth DeviceAttribute = 53 /**< Maximum cubemap layered texture width/height */
  CudaDevAttrMaxTextureCubemapLayeredLayers DeviceAttribute = 54 /**< Maximum layers in a cubemap layered texture */
  CudaDevAttrMaxSurface1DWidth DeviceAttribute = 55 /**< Maximum 1D surface width */
  CudaDevAttrMaxSurface2DWidth DeviceAttribute = 56 /**< Maximum 2D surface width */
  CudaDevAttrMaxSurface2DHeight DeviceAttribute = 57 /**< Maximum 2D surface height */
  CudaDevAttrMaxSurface3DWidth DeviceAttribute = 58 /**< Maximum 3D surface width */
  CudaDevAttrMaxSurface3DHeight DeviceAttribute = 59 /**< Maximum 3D surface height */
  CudaDevAttrMaxSurface3DDepth DeviceAttribute = 60 /**< Maximum 3D surface depth */
  CudaDevAttrMaxSurface1DLayeredWidth DeviceAttribute = 61 /**< Maximum 1D layered surface width */
  CudaDevAttrMaxSurface1DLayeredLayers DeviceAttribute = 62 /**< Maximum layers in a 1D layered surface */
  CudaDevAttrMaxSurface2DLayeredWidth DeviceAttribute = 63 /**< Maximum 2D layered surface width */
  CudaDevAttrMaxSurface2DLayeredHeight DeviceAttribute = 64 /**< Maximum 2D layered surface height */
  CudaDevAttrMaxSurface2DLayeredLayers DeviceAttribute = 65 /**< Maximum layers in a 2D layered surface */
  CudaDevAttrMaxSurfaceCubemapWidth DeviceAttribute = 66 /**< Maximum cubemap surface width */
  CudaDevAttrMaxSurfaceCubemapLayeredWidth DeviceAttribute = 67 /**< Maximum cubemap layered surface width */
  CudaDevAttrMaxSurfaceCubemapLayeredLayers DeviceAttribute = 68 /**< Maximum layers in a cubemap layered surface */
  CudaDevAttrMaxTexture1DLinearWidth DeviceAttribute = 69 /**< Maximum 1D linear texture width */
  CudaDevAttrMaxTexture2DLinearWidth DeviceAttribute = 70 /**< Maximum 2D linear texture width */
  CudaDevAttrMaxTexture2DLinearHeight DeviceAttribute = 71 /**< Maximum 2D linear texture height */
  CudaDevAttrMaxTexture2DLinearPitch DeviceAttribute = 72 /**< Maximum 2D linear texture pitch in bytes */
  CudaDevAttrMaxTexture2DMipmappedWidth DeviceAttribute = 73 /**< Maximum mipmapped 2D texture width */
  CudaDevAttrMaxTexture2DMipmappedHeight DeviceAttribute = 74 /**< Maximum mipmapped 2D texture height */
  CudaDevAttrComputeCapabilityMajor DeviceAttribute = 75 /**< Major compute capability version number */
  CudaDevAttrComputeCapabilityMinor DeviceAttribute = 76 /**< Minor compute capability version number */
  CudaDevAttrMaxTexture1DMipmappedWidth DeviceAttribute = 77 /**< Maximum mipmapped 1D texture width */
  CudaDevAttrStreamPrioritiesSupported DeviceAttribute = 78 /**< Device supports stream priorities */
  CudaDevAttrGlobalL1CacheSupported DeviceAttribute = 79 /**< Device supports caching globals in L1 */
  CudaDevAttrLocalL1CacheSupported DeviceAttribute = 80 /**< Device supports caching locals in L1 */
  CudaDevAttrMaxSharedMemoryPerMultiprocessor DeviceAttribute = 81 /**< Maximum shared memory available per multiprocessor in bytes */
  CudaDevAttrMaxRegistersPerMultiprocessor DeviceAttribute = 82 /**< Maximum number of 32-bit registers available per multiprocessor */
  CudaDevAttrManagedMemory DeviceAttribute = 83 /**< Device can allocate managed memory on this system */
  CudaDevAttrIsMultiGpuBoard DeviceAttribute = 84 /**< Device is on a multi-GPU board */
  CudaDevAttrMultiGpuBoardGroupID DeviceAttribute = 85 /**< Unique identifier for a group of devices on the same multi-GPU board */
  CudaDevAttrHostNativeAtomicSupported DeviceAttribute = 86 /**< Link between the device and the host supports native atomic operations */
  CudaDevAttrSingleToDoublePrecisionPerfRatio DeviceAttribute = 87 /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
  CudaDevAttrPageableMemoryAccess DeviceAttribute = 88 /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
  CudaDevAttrConcurrentManagedAccess DeviceAttribute = 89 /**< Device can coherently access managed memory concurrently with the CPU */
  CudaDevAttrComputePreemptionSupported DeviceAttribute = 90 /**< Device supports Compute Preemption */
  CudaDevAttrCanUseHostPointerForRegisteredMem DeviceAttribute = 91 /**< Device can access host registered memory at the same virtual address as the CPU */
  CudaDevAttrReserved92 DeviceAttribute = 92
  CudaDevAttrReserved93 DeviceAttribute = 93
  CudaDevAttrReserved94 DeviceAttribute = 94
  CudaDevAttrCooperativeLaunch DeviceAttribute = 95 /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel*/
  CudaDevAttrCooperativeMultiDeviceLaunch DeviceAttribute = 96 /**< Deprecated cudaLaunchCooperativeKernelMultiDevice is deprecated. */
  CudaDevAttrMaxSharedMemoryPerBlockOptin DeviceAttribute = 97 /**< The maximum opt-in shared memory per block. This value may vary by chip. See ::cudaFuncSetAttribute */
  CudaDevAttrCanFlushRemoteWrites DeviceAttribute = 98 /**< Device supports flushing of outstanding remote writes. */
  CudaDevAttrHostRegisterSupported DeviceAttribute = 99 /**< Device supports host memory registration via ::cudaHostRegister. */
  CudaDevAttrPageableMemoryAccessUsesHostPageTables DeviceAttribute = 100 /**< Device accesses pageable memory via the host's page tables. */
  CudaDevAttrDirectManagedMemAccessFromHost DeviceAttribute = 101 /**< Host can directly access managed memory on the device without migration. */
  CudaDevAttrMaxBlocksPerMultiprocessor DeviceAttribute = 106 /**< Maximum number of blocks per multiprocessor */
  CudaDevAttrMaxPersistingL2CacheSize DeviceAttribute = 108 /**< Maximum L2 persisting lines capacity setting in bytes. */
  CudaDevAttrMaxAccessPolicyWindowSize DeviceAttribute = 109 /**< Maximum value of cudaAccessPolicyWindow::num_bytes. */
  CudaDevAttrReservedSharedMemoryPerBlock DeviceAttribute = 111 /**< Shared memory reserved by CUDA driver per block in bytes */
  CudaDevAttrSparseCudaArraySupported DeviceAttribute = 112 /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
  CudaDevAttrHostRegisterReadOnlySupported DeviceAttribute = 113 /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */
  CudaDevAttrTimelineSemaphoreInteropSupported DeviceAttribute = 114 /**< External timeline semaphore interop is supported on the device */
  CudaDevAttrMaxTimelineSemaphoreInteropSupported DeviceAttribute = 114 /**< Deprecated External timeline semaphore interop is supported on the device */
  CudaDevAttrMemoryPoolsSupported DeviceAttribute = 115 /**< Device supports using the ::cudaMallocAsync and ::cudaMemPool family of APIs */
  CudaDevAttrGPUDirectRDMASupported DeviceAttribute = 116 /**< Device supports GPUDirect RDMA APIs like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
  CudaDevAttrGPUDirectRDMAFlushWritesOptions DeviceAttribute = 117 /**< The returned attribute shall be interpreted as a bitmask where the individual bits are listed in the ::cudaFlushGPUDirectRDMAWritesOptions enum */
  CudaDevAttrGPUDirectRDMAWritesOrdering DeviceAttribute = 118 /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::cudaGPUDirectRDMAWritesOrdering for the numerical values returned here. */
  CudaDevAttrMemoryPoolSupportedHandleTypes DeviceAttribute = 119 /**< Handle types supported with mempool based IPC */
  CudaDevAttrClusterLaunch DeviceAttribute = 120 /**< Indicates device supports cluster launch */
  CudaDevAttrDeferredMappingCudaArraySupported DeviceAttribute = 121 /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
  CudaDevAttrReserved122 DeviceAttribute = 122
  CudaDevAttrReserved123 DeviceAttribute = 123
  CudaDevAttrReserved124 DeviceAttribute = 124
  CudaDevAttrIpcEventSupport DeviceAttribute = 125 /**< Device supports IPC Events. */
  CudaDevAttrMemSyncDomainCount DeviceAttribute = 126 /**< Number of memory synchronization domains the device supports. */
  CudaDevAttrReserved127 DeviceAttribute = 127
  CudaDevAttrReserved128 DeviceAttribute = 128
  CudaDevAttrReserved129 DeviceAttribute = 129
  CudaDevAttrNumaConfig DeviceAttribute = 130 /**< NUMA configuration of a device: value is of type cudaDeviceNumaConfig enum */
  CudaDevAttrNumaId DeviceAttribute = 131 /**< NUMA node ID of the GPU memory */
  CudaDevAttrReserved132 DeviceAttribute = 132
  CudaDevAttrMpsEnabled DeviceAttribute = 133 /**< Contexts created on this device will be shared via MPS */
  CudaDevAttrHostNumaId DeviceAttribute = 134 /**< NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA. */
  CudaDevAttrMax DeviceAttribute = 135
)
@@ -74,6 +74,14 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
  return int(cCudaPointerAttributes.device)
}

func GetDeviceAttribute(attr DeviceAttribute, device int) int {
  var res int
  cRes := (*C.int)(unsafe.Pointer(&res))
  cDevice := (C.int)(device)
  C.cudaDeviceGetAttribute(cRes, attr, cDevice)
  return res
}

// RunOnDevice forces the provided function to run all GPU related calls within it
// on the same host thread and therefore the same GPU device.
//
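GetDeviceAttribute is what the new tests use to decide whether pinning is worthwhile on device 0. A hedged sketch of that gating logic pulled into a helper (the helper itself is illustrative and not part of the bindings; the interpretation of the two attributes follows the tests further down):

func canUsePinnedMemory(device int) bool {
  // Host registration must be supported, and the device must not already
  // access pageable host memory through the host's page tables, in which
  // case explicit pinning adds little benefit.
  pinnable := GetDeviceAttribute(CudaDevAttrHostRegisterSupported, device)
  lockable := GetDeviceAttribute(CudaDevAttrPageableMemoryAccessUsesHostPageTables, device)
  return pinnable == 1 && lockable == 0
}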
@@ -84,46 +92,46 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
//
// As an example:
//
// cr.RunOnDevice(i, func(args ...any) {
//   defer wg.Done()
//   cfg := GetDefaultMSMConfig()
//   stream, _ := cr.CreateStream()
//   for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
//     size := 1 << power
//
//     // This will always print "Inner goroutine device: 0"
//     // go func () {
//     //   device, _ := cr.GetDevice()
//     //   fmt.Println("Inner goroutine device: ", device)
//     // }()
//     // To force the above goroutine to same device as the wrapping function:
//     // RunOnDevice(i, func(arg ...any) {
//     //   device, _ := cr.GetDevice()
//     //   fmt.Println("Inner goroutine device: ", device)
//     // })
//
//     scalars := GenerateScalars(size)
//     points := GenerateAffinePoints(size)
//
//     var p Projective
//     var out core.DeviceSlice
//     _, e := out.MallocAsync(p.Size(), p.Size(), stream)
//     assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
//     cfg.Ctx.Stream = &stream
//     cfg.IsAsync = true
//
//     e = Msm(scalars, points, &cfg, out)
//     assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
//
//     outHost := make(core.HostSlice[Projective], 1)
//
//     cr.SynchronizeStream(&stream)
//     outHost.CopyFromDevice(&out)
//     out.Free()
//     // Check with gnark-crypto
//     assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
//   }
// }, i)
// cr.RunOnDevice(i, func(args ...any) {
//   defer wg.Done()
//   cfg := GetDefaultMSMConfig()
//   stream, _ := cr.CreateStream()
//   for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
//     size := 1 << power

//     // This will always print "Inner goroutine device: 0"
//     // go func () {
//     //   device, _ := cr.GetDevice()
//     //   fmt.Println("Inner goroutine device: ", device)
//     // }()
//     // To force the above goroutine to same device as the wrapping function:
//     // RunOnDevice(i, func(arg ...any) {
//     //   device, _ := cr.GetDevice()
//     //   fmt.Println("Inner goroutine device: ", device)
//     // })

//     scalars := GenerateScalars(size)
//     points := GenerateAffinePoints(size)

//     var p Projective
//     var out core.DeviceSlice
//     _, e := out.MallocAsync(p.Size(), p.Size(), stream)
//     assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
//     cfg.Ctx.Stream = &stream
//     cfg.IsAsync = true

//     e = Msm(scalars, points, &cfg, out)
//     assert.Equal(t, e, cr.CudaSuccess, "Msm failed")

//     outHost := make(core.HostSlice[Projective], 1)

//     cr.SynchronizeStream(&stream)
//     outHost.CopyFromDevice(&out)
//     out.Free()
//     // Check with gnark-crypto
//     assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
//   }
// }, i)
func RunOnDevice(deviceId int, funcToRun func(args ...any), args ...any) {
  go func(id int) {
    defer runtime.UnlockOSThread()
@@ -9,6 +9,7 @@ package cuda_runtime
import "C"

import (
  // "runtime"
  "unsafe"
)

@@ -58,6 +59,45 @@ func FreeAsync(devicePtr unsafe.Pointer, stream Stream) CudaError {
  return err
}

func AllocPinned(size int, flags AllocPinnedFlags) (unsafe.Pointer, CudaError) {
  cSize := (C.size_t)(size)
  var hostPtr unsafe.Pointer
  ret := C.cudaHostAlloc(&hostPtr, cSize, flags)
  err := (CudaError)(ret)
  if err != CudaSuccess {
    return nil, err
  }

  return hostPtr, CudaSuccess
}

func GetHostFlags(ptr unsafe.Pointer) (flag uint) {
  cFlag := (C.uint)(flag)
  C.cudaHostGetFlags(&cFlag, ptr)
  flag = uint(cFlag) // propagate the queried flags back to the named return value
  return
}

func FreeAllocPinned(hostPtr unsafe.Pointer) CudaError {
  return (CudaError)(C.cudaFreeHost(hostPtr))
}

func RegisterPinned(hostPtr unsafe.Pointer, size int, flags RegisterPinnedFlags) (unsafe.Pointer, CudaError) {
  cSize := (C.size_t)(size)
  // Mask off higher flag bits; only CudaHostRegisterDefault and CudaHostRegisterPortable are currently supported
  flags = flags & 3
  ret := C.cudaHostRegister(hostPtr, cSize, flags)
  err := (CudaError)(ret)
  if err != CudaSuccess {
    return nil, err
  }

  return hostPtr, CudaSuccess
}

func FreeRegisteredPinned(hostPtr unsafe.Pointer) CudaError {
  return (CudaError)(C.cudaHostUnregister(hostPtr))
}

func CopyFromDevice(hostDst, deviceSrc unsafe.Pointer, size uint) (unsafe.Pointer, CudaError) {
  cCount := (C.size_t)(size)
  ret := C.cudaMemcpy(hostDst, deviceSrc, cCount, uint32(CudaMemcpyDeviceToHost))
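GetHostFlags is not exercised by the tests below; a hedged sketch of how it could be used to confirm how a pointer was pinned (this assumes the flags value is propagated back to the caller, as in the adjusted version above):

ptr, err := AllocPinned(1024, CudaHostAllocPortable)
if err == CudaSuccess {
  flags := GetHostFlags(ptr)
  if flags&uint(CudaHostAllocPortable) != 0 {
    // the allocation is visible to all CUDA contexts
  }
  FreeAllocPinned(ptr)
}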
@@ -40,3 +40,27 @@ func TestCopyFromToHost(t *testing.T) {
  assert.Equal(t, CudaSuccess, err, "Couldn't copy to device due to %v", err)
  assert.Equal(t, someInts, someInts2, "Elements of host slices do not match. Copying from/to host failed")
}

func TestRegisterUnregisterPinned(t *testing.T) {
  data := []int{1, 2, 3, 4, 5, 7, 8, 9}
  dataUnsafe := unsafe.Pointer(&data[0])
  _, err := RegisterPinned(dataUnsafe, int(unsafe.Sizeof(data[0])*9), CudaHostRegisterDefault)
  assert.Equal(t, CudaSuccess, err)
  _, err = RegisterPinned(dataUnsafe, int(unsafe.Sizeof(data[0])*9), CudaHostRegisterDefault)
  assert.Equal(t, CudaErrorHostMemoryAlreadyRegistered, err)

  err = FreeRegisteredPinned(dataUnsafe)
  assert.Equal(t, CudaSuccess, err)
  err = FreeRegisteredPinned(dataUnsafe)
  assert.Equal(t, CudaErrorHostMemoryNotRegistered, err)
}

func TestAllocFreePinned(t *testing.T) {
  pinnedMemPointer, err := AllocPinned(int(unsafe.Sizeof(1)*9), CudaHostAllocDefault)
  assert.Equal(t, CudaSuccess, err)

  err = FreeAllocPinned(pinnedMemPointer)
  assert.Equal(t, CudaSuccess, err)
  err = FreeAllocPinned(pinnedMemPointer)
  assert.Equal(t, CudaErrorInvalidValue, err)
}
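The double-register and double-free cases above show that CUDA's error codes are surfaced unchanged. A caller that wants pinning to be idempotent can treat the "already registered" code as success; an illustrative helper (not part of this PR):

func pinIdempotent(ptr unsafe.Pointer, size int) CudaError {
  _, err := RegisterPinned(ptr, size, CudaHostRegisterDefault)
  if err == CudaErrorHostMemoryAlreadyRegistered {
    return CudaSuccess
  }
  return err
}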
@@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {

  }
}

func TestMSMG2PinnedHostMemory(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBls12_377.GenerateScalars(size)
    points := g2.G2GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[g2.G2Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p g2.G2Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[g2.G2Projective], 1)

    e = g2.G2Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{3} {
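Pinned host memory pays off most when combined with the asynchronous, stream-based flow already shown in the RunOnDevice doc comment, since the runtime can only overlap host-to-device copies with compute when the source buffer is page-locked. A hedged sketch that merges the two (names follow the async example and the test above; it is not itself part of this commit):

stream, _ := cr.CreateStream()
cfg := g2.G2GetDefaultMSMConfig()
cfg.Ctx.Stream = &stream
cfg.IsAsync = true

points.Pin(cr.CudaHostRegisterDefault) // a page-locked input lets the async copy overlap with compute

var p g2.G2Projective
var out core.DeviceSlice
out.MallocAsync(p.Size(), p.Size(), stream)
if e := g2.G2Msm(scalars, points, &cfg, out); e != cr.CudaSuccess {
  // handle error
}
cr.SynchronizeStream(&stream)

points.Unpin()
out.Free()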
@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBls12_377.GenerateScalars(size)
    points := icicleBls12_377.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleBls12_377.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleBls12_377.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleBls12_377.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := bls12_377.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {

  }
}

func TestMSMG2PinnedHostMemory(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBls12_381.GenerateScalars(size)
    points := g2.G2GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[g2.G2Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p g2.G2Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[g2.G2Projective], 1)

    e = g2.G2Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBls12_381.GenerateScalars(size)
    points := icicleBls12_381.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleBls12_381.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleBls12_381.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleBls12_381.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := bls12_381.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {

  }
}

func TestMSMG2PinnedHostMemory(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBn254.GenerateScalars(size)
    points := g2.G2GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[g2.G2Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p g2.G2Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[g2.G2Projective], 1)

    e = g2.G2Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBn254.GenerateScalars(size)
    points := icicleBn254.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleBn254.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleBn254.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleBn254.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := bn254.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -106,6 +106,58 @@ func TestMSMG2(t *testing.T) {

  }
}

func TestMSMG2PinnedHostMemory(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBw6_761.GenerateScalars(size)
    points := g2.G2GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[g2.G2Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p g2.G2Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[g2.G2Projective], 1)

    e = g2.G2Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBw6_761.GenerateScalars(size)
    points := icicleBw6_761.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleBw6_761.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleBw6_761.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleBw6_761.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := bw6_761.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -40,6 +40,54 @@ func TestMSM(t *testing.T) {
  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleGrumpkin.GenerateScalars(size)
    points := icicleGrumpkin.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleGrumpkin.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleGrumpkin.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleGrumpkin.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}

func TestMSMBatch(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10, 16} {
@@ -61,11 +61,12 @@ func TestNttDeviceAsyncNoDomain(t *testing.T) {

func TestNttBatchNoDomain(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := babybear_extension.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -80,11 +80,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := babybear.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -185,6 +185,61 @@ func TestMSM{{.CurvePrefix}}(t *testing.T) {
  {{end}}
  }
}

func TestMSM{{.CurvePrefix}}PinnedHostMemory(t *testing.T) {
  cfg := {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicle{{capitalize .Curve}}.GenerateScalars(size)
    points := {{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[{{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p {{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[{{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective], 1)

    e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    {{if ne .GnarkImport "" -}}
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm{{.CurvePrefix}}(scalars, points, outHost[0]))
    {{end}}

    if pinnableAndLockable {
      e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      {{if ne .GnarkImport "" -}}
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm{{.CurvePrefix}}(scalars, pinnedPoints, outHost[0]))
      {{end}}
    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
{{if ne .GnarkImport "" -}}
func TestMSM{{if eq .CurvePrefix "G2"}}G2{{end}}GnarkCryptoTypes(t *testing.T) {
  cfg := {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}GetDefaultMSMConfig()
@@ -64,11 +64,12 @@ func TestNttDeviceAsyncNoDomain(t *testing.T) {

func TestNttBatchNoDomain(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := {{.FieldNoDomain}}.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -163,11 +163,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := {{.Field}}.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -53,6 +53,8 @@ fn main() {
  .allowlist_function("cudaGetDevice")
  .allowlist_function("cudaSetDevice")
  .allowlist_function("cudaGetDeviceCount")
  .allowlist_function("cudaDeviceGetAttribute")
  .rustified_enum("cudaDeviceAttr")
  // error handling
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
  .allowlist_function("cudaGetLastError")
@@ -79,6 +81,15 @@ fn main() {
  .allowlist_function("cudaDeviceGetDefaultMemPool")
  .allowlist_function("cudaMemGetInfo")
  .rustified_enum("cudaMemcpyKind")
  .allowlist_function("cudaHostAlloc")
  .allowlist_var("cudaHostAllocDefault")
  .allowlist_var("cudaHostAllocPortable")
  .allowlist_function("cudaFreeHost")
  .allowlist_function("cudaHostGetFlags")
  .allowlist_function("cudaHostRegister")
  .allowlist_var("cudaHostRegisterDefault")
  .allowlist_var("cudaHostRegisterPortable")
  .allowlist_function("cudaHostUnregister")
  // Stream Ordered Memory Allocator
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
  .allowlist_function("cudaFreeAsync")