[FEAT]: Golang Bindings for pinned host memory (#519)
## Describe the changes

This PR adds the ability to pin host memory in the Golang bindings, allowing faster host-to-device data transfers. Memory can be pinned once for multiple devices by passing `cuda_runtime.CudaHostRegisterPortable` (when registering existing memory) or `cuda_runtime.CudaHostAllocPortable` (when allocating new pinned memory).
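As a quick illustration of that flag choice (a hedged sketch, not part of the diff below; `points` stands for any existing core.HostSlice and error handling is omitted):

// Register the slice's existing backing memory as pinned once for all devices:
points.Pin(cuda_runtime.CudaHostRegisterPortable)
// ... transfers ...
points.Unpin()

// Or allocate a fresh pinned copy of the data, likewise visible to all CUDA contexts:
pinnedPoints, err := points.AllocPinned(cuda_runtime.CudaHostAllocPortable)
if err == cuda_runtime.CudaSuccess {
  defer pinnedPoints.FreePinned()
}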
@@ -197,6 +197,35 @@ func (h HostSlice[T]) AsUnsafePointer() unsafe.Pointer {
  return unsafe.Pointer(&h[0])
}

// Registers host memory as pinned, allowing the GPU to read data from the host quicker and save GPU memory space.
// Memory pinned using this function should be unpinned using [Unpin]
func (h HostSlice[T]) Pin(flags cr.RegisterPinnedFlags) cr.CudaError {
  _, err := cr.RegisterPinned(h.AsUnsafePointer(), h.SizeOfElement()*h.Len(), flags)
  return err
}

// Unregisters host memory as pinned
func (h HostSlice[T]) Unpin() cr.CudaError {
  return cr.FreeRegisteredPinned(h.AsUnsafePointer())
}

// Allocates new host memory as pinned and copies the HostSlice data to the newly allocated area
// Memory pinned using this function should be unpinned using [FreePinned]
func (h HostSlice[T]) AllocPinned(flags cr.AllocPinnedFlags) (HostSlice[T], cr.CudaError) {
  pinnedMemPointer, err := cr.AllocPinned(h.SizeOfElement()*h.Len(), flags)
  if err != cr.CudaSuccess {
    return nil, err
  }
  pinnedMem := unsafe.Slice((*T)(pinnedMemPointer), h.Len())
  copy(pinnedMem, h)
  return pinnedMem, cr.CudaSuccess
}

// Unpins host memory that was pinned using [AllocPinned]
func (h HostSlice[T]) FreePinned() cr.CudaError {
  return cr.FreeAllocPinned(h.AsUnsafePointer())
}

func (h HostSlice[T]) CopyToDevice(dst *DeviceSlice, shouldAllocate bool) *DeviceSlice {
  size := h.Len() * h.SizeOfElement()
  if shouldAllocate {
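A minimal end-to-end sketch of how these methods combine with CopyToDevice (illustrative only; `somePoints`, the element type, and the error handling are placeholders, not part of this commit):

points := core.HostSliceFromElements(somePoints)

// Variant 1: register the existing backing memory, copy, then unpin.
if err := points.Pin(cuda_runtime.CudaHostRegisterDefault); err != cuda_runtime.CudaSuccess {
  // handle error
}
var devicePoints core.DeviceSlice
points.CopyToDevice(&devicePoints, true) // the copy benefits from the page-locked source
points.Unpin()

// Variant 2: allocate a fresh pinned copy and free it when done.
pinnedPoints, err := points.AllocPinned(cuda_runtime.CudaHostAllocDefault)
if err == cuda_runtime.CudaSuccess {
  pinnedPoints.CopyToDevice(&devicePoints, false)
  pinnedPoints.FreePinned()
}
devicePoints.Free()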
@@ -6,6 +6,7 @@ import (
  "unsafe"

  "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core/internal"
  "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
  "github.com/stretchr/testify/assert"
)

@@ -222,3 +223,26 @@ func TestSliceRanges(t *testing.T) {
  hostSliceRange.CopyFromDevice(&deviceSliceRange)
  assert.Equal(t, hostSlice[2:6], hostSliceRange)
}

func TestHostSlicePinning(t *testing.T) {
  data := []int{1, 2, 3, 4, 5, 7, 8, 9}
  dataHostSlice := HostSliceFromElements(data)
  err := dataHostSlice.Pin(cuda_runtime.CudaHostRegisterDefault)
  assert.Equal(t, cuda_runtime.CudaSuccess, err)
  err = dataHostSlice.Pin(cuda_runtime.CudaHostRegisterDefault)
  assert.Equal(t, cuda_runtime.CudaErrorHostMemoryAlreadyRegistered, err)

  err = dataHostSlice.Unpin()
  assert.Equal(t, cuda_runtime.CudaSuccess, err)
  err = dataHostSlice.Unpin()
  assert.Equal(t, cuda_runtime.CudaErrorHostMemoryNotRegistered, err)

  pinnedMem, err := dataHostSlice.AllocPinned(cuda_runtime.CudaHostAllocDefault)
  assert.Equal(t, cuda_runtime.CudaSuccess, err)
  assert.ElementsMatch(t, dataHostSlice, pinnedMem)

  err = pinnedMem.FreePinned()
  assert.Equal(t, cuda_runtime.CudaSuccess, err)
  err = pinnedMem.FreePinned()
  assert.Equal(t, cuda_runtime.CudaErrorInvalidValue, err)
}
@@ -177,3 +177,160 @@ const (
  // CudaMemcpyDefault as declared in include/driver_types.h:1223
  CudaMemcpyDefault CudaMemcpyKind = 4
)

type AllocPinnedFlags = C.uint

// cudaHostAlloc flags as declared in include/driver_types.h:85-88
const (
  CudaHostAllocDefault AllocPinnedFlags = 0x00 /**< Default page-locked allocation flag */
  CudaHostAllocPortable AllocPinnedFlags = 0x01 /**< Pinned memory accessible by all CUDA contexts */
  // Currently not supported
  // CudaHostAllocMapped AllocPinnedFlags = 0x02 /**< Map allocation into device space */
  // CudaHostAllocWriteCombined AllocPinnedFlags = 0x04 /**< Write-combined memory */
)

type RegisterPinnedFlags = C.uint

// cudaHostRegister flags as declared in include/driver_types.h:90-94
const (
  CudaHostRegisterDefault RegisterPinnedFlags = 0x00 /**< Default host memory registration flag */
  CudaHostRegisterPortable RegisterPinnedFlags = 0x01 /**< Pinned memory accessible by all CUDA contexts */
  // Currently not supported
  // CudaHostRegisterMapped RegisterPinnedFlags = 0x02 /**< Map registered memory into device space */
  // cudaHostRegisterIoMemory RegisterPinnedFlags = 0x04 /**< Memory-mapped I/O space */
  // cudaHostRegisterReadOnly RegisterPinnedFlags = 0x08 /**< Memory-mapped read-only */
)

type DeviceAttribute = uint32

const (
  CudaDevAttrMaxThreadsPerBlock DeviceAttribute = 1 /**< Maximum number of threads per block */
  CudaDevAttrMaxBlockDimX DeviceAttribute = 2 /**< Maximum block dimension X */
  CudaDevAttrMaxBlockDimY DeviceAttribute = 3 /**< Maximum block dimension Y */
  CudaDevAttrMaxBlockDimZ DeviceAttribute = 4 /**< Maximum block dimension Z */
  CudaDevAttrMaxGridDimX DeviceAttribute = 5 /**< Maximum grid dimension X */
  CudaDevAttrMaxGridDimY DeviceAttribute = 6 /**< Maximum grid dimension Y */
  CudaDevAttrMaxGridDimZ DeviceAttribute = 7 /**< Maximum grid dimension Z */
  CudaDevAttrMaxSharedMemoryPerBlock DeviceAttribute = 8 /**< Maximum shared memory available per block in bytes */
  CudaDevAttrTotalConstantMemory DeviceAttribute = 9 /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
  CudaDevAttrWarpSize DeviceAttribute = 10 /**< Warp size in threads */
  CudaDevAttrMaxPitch DeviceAttribute = 11 /**< Maximum pitch in bytes allowed by memory copies */
  CudaDevAttrMaxRegistersPerBlock DeviceAttribute = 12 /**< Maximum number of 32-bit registers available per block */
  CudaDevAttrClockRate DeviceAttribute = 13 /**< Peak clock frequency in kilohertz */
  CudaDevAttrTextureAlignment DeviceAttribute = 14 /**< Alignment requirement for textures */
  CudaDevAttrGpuOverlap DeviceAttribute = 15 /**< Device can possibly copy memory and execute a kernel concurrently */
  CudaDevAttrMultiProcessorCount DeviceAttribute = 16 /**< Number of multiprocessors on device */
  CudaDevAttrKernelExecTimeout DeviceAttribute = 17 /**< Specifies whether there is a run time limit on kernels */
  CudaDevAttrIntegrated DeviceAttribute = 18 /**< Device is integrated with host memory */
  CudaDevAttrCanMapHostMemory DeviceAttribute = 19 /**< Device can map host memory into CUDA address space */
  CudaDevAttrComputeMode DeviceAttribute = 20 /**< Compute mode (See ::cudaComputeMode for details) */
  CudaDevAttrMaxTexture1DWidth DeviceAttribute = 21 /**< Maximum 1D texture width */
  CudaDevAttrMaxTexture2DWidth DeviceAttribute = 22 /**< Maximum 2D texture width */
  CudaDevAttrMaxTexture2DHeight DeviceAttribute = 23 /**< Maximum 2D texture height */
  CudaDevAttrMaxTexture3DWidth DeviceAttribute = 24 /**< Maximum 3D texture width */
  CudaDevAttrMaxTexture3DHeight DeviceAttribute = 25 /**< Maximum 3D texture height */
  CudaDevAttrMaxTexture3DDepth DeviceAttribute = 26 /**< Maximum 3D texture depth */
  CudaDevAttrMaxTexture2DLayeredWidth DeviceAttribute = 27 /**< Maximum 2D layered texture width */
  CudaDevAttrMaxTexture2DLayeredHeight DeviceAttribute = 28 /**< Maximum 2D layered texture height */
  CudaDevAttrMaxTexture2DLayeredLayers DeviceAttribute = 29 /**< Maximum layers in a 2D layered texture */
  CudaDevAttrSurfaceAlignment DeviceAttribute = 30 /**< Alignment requirement for surfaces */
  CudaDevAttrConcurrentKernels DeviceAttribute = 31 /**< Device can possibly execute multiple kernels concurrently */
  CudaDevAttrEccEnabled DeviceAttribute = 32 /**< Device has ECC support enabled */
  CudaDevAttrPciBusId DeviceAttribute = 33 /**< PCI bus ID of the device */
  CudaDevAttrPciDeviceId DeviceAttribute = 34 /**< PCI device ID of the device */
  CudaDevAttrTccDriver DeviceAttribute = 35 /**< Device is using TCC driver model */
  CudaDevAttrMemoryClockRate DeviceAttribute = 36 /**< Peak memory clock frequency in kilohertz */
  CudaDevAttrGlobalMemoryBusWidth DeviceAttribute = 37 /**< Global memory bus width in bits */
  CudaDevAttrL2CacheSize DeviceAttribute = 38 /**< Size of L2 cache in bytes */
  CudaDevAttrMaxThreadsPerMultiProcessor DeviceAttribute = 39 /**< Maximum resident threads per multiprocessor */
  CudaDevAttrAsyncEngineCount DeviceAttribute = 40 /**< Number of asynchronous engines */
  CudaDevAttrUnifiedAddressing DeviceAttribute = 41 /**< Device shares a unified address space with the host */
  CudaDevAttrMaxTexture1DLayeredWidth DeviceAttribute = 42 /**< Maximum 1D layered texture width */
  CudaDevAttrMaxTexture1DLayeredLayers DeviceAttribute = 43 /**< Maximum layers in a 1D layered texture */
  CudaDevAttrMaxTexture2DGatherWidth DeviceAttribute = 45 /**< Maximum 2D texture width if cudaArrayTextureGather is set */
  CudaDevAttrMaxTexture2DGatherHeight DeviceAttribute = 46 /**< Maximum 2D texture height if cudaArrayTextureGather is set */
  CudaDevAttrMaxTexture3DWidthAlt DeviceAttribute = 47 /**< Alternate maximum 3D texture width */
  CudaDevAttrMaxTexture3DHeightAlt DeviceAttribute = 48 /**< Alternate maximum 3D texture height */
  CudaDevAttrMaxTexture3DDepthAlt DeviceAttribute = 49 /**< Alternate maximum 3D texture depth */
  CudaDevAttrPciDomainId DeviceAttribute = 50 /**< PCI domain ID of the device */
  CudaDevAttrTexturePitchAlignment DeviceAttribute = 51 /**< Pitch alignment requirement for textures */
  CudaDevAttrMaxTextureCubemapWidth DeviceAttribute = 52 /**< Maximum cubemap texture width/height */
  CudaDevAttrMaxTextureCubemapLayeredWidth DeviceAttribute = 53 /**< Maximum cubemap layered texture width/height */
  CudaDevAttrMaxTextureCubemapLayeredLayers DeviceAttribute = 54 /**< Maximum layers in a cubemap layered texture */
  CudaDevAttrMaxSurface1DWidth DeviceAttribute = 55 /**< Maximum 1D surface width */
  CudaDevAttrMaxSurface2DWidth DeviceAttribute = 56 /**< Maximum 2D surface width */
  CudaDevAttrMaxSurface2DHeight DeviceAttribute = 57 /**< Maximum 2D surface height */
  CudaDevAttrMaxSurface3DWidth DeviceAttribute = 58 /**< Maximum 3D surface width */
  CudaDevAttrMaxSurface3DHeight DeviceAttribute = 59 /**< Maximum 3D surface height */
  CudaDevAttrMaxSurface3DDepth DeviceAttribute = 60 /**< Maximum 3D surface depth */
  CudaDevAttrMaxSurface1DLayeredWidth DeviceAttribute = 61 /**< Maximum 1D layered surface width */
  CudaDevAttrMaxSurface1DLayeredLayers DeviceAttribute = 62 /**< Maximum layers in a 1D layered surface */
  CudaDevAttrMaxSurface2DLayeredWidth DeviceAttribute = 63 /**< Maximum 2D layered surface width */
  CudaDevAttrMaxSurface2DLayeredHeight DeviceAttribute = 64 /**< Maximum 2D layered surface height */
  CudaDevAttrMaxSurface2DLayeredLayers DeviceAttribute = 65 /**< Maximum layers in a 2D layered surface */
  CudaDevAttrMaxSurfaceCubemapWidth DeviceAttribute = 66 /**< Maximum cubemap surface width */
  CudaDevAttrMaxSurfaceCubemapLayeredWidth DeviceAttribute = 67 /**< Maximum cubemap layered surface width */
  CudaDevAttrMaxSurfaceCubemapLayeredLayers DeviceAttribute = 68 /**< Maximum layers in a cubemap layered surface */
  CudaDevAttrMaxTexture1DLinearWidth DeviceAttribute = 69 /**< Maximum 1D linear texture width */
  CudaDevAttrMaxTexture2DLinearWidth DeviceAttribute = 70 /**< Maximum 2D linear texture width */
  CudaDevAttrMaxTexture2DLinearHeight DeviceAttribute = 71 /**< Maximum 2D linear texture height */
  CudaDevAttrMaxTexture2DLinearPitch DeviceAttribute = 72 /**< Maximum 2D linear texture pitch in bytes */
  CudaDevAttrMaxTexture2DMipmappedWidth DeviceAttribute = 73 /**< Maximum mipmapped 2D texture width */
  CudaDevAttrMaxTexture2DMipmappedHeight DeviceAttribute = 74 /**< Maximum mipmapped 2D texture height */
  CudaDevAttrComputeCapabilityMajor DeviceAttribute = 75 /**< Major compute capability version number */
  CudaDevAttrComputeCapabilityMinor DeviceAttribute = 76 /**< Minor compute capability version number */
  CudaDevAttrMaxTexture1DMipmappedWidth DeviceAttribute = 77 /**< Maximum mipmapped 1D texture width */
  CudaDevAttrStreamPrioritiesSupported DeviceAttribute = 78 /**< Device supports stream priorities */
  CudaDevAttrGlobalL1CacheSupported DeviceAttribute = 79 /**< Device supports caching globals in L1 */
  CudaDevAttrLocalL1CacheSupported DeviceAttribute = 80 /**< Device supports caching locals in L1 */
  CudaDevAttrMaxSharedMemoryPerMultiprocessor DeviceAttribute = 81 /**< Maximum shared memory available per multiprocessor in bytes */
  CudaDevAttrMaxRegistersPerMultiprocessor DeviceAttribute = 82 /**< Maximum number of 32-bit registers available per multiprocessor */
  CudaDevAttrManagedMemory DeviceAttribute = 83 /**< Device can allocate managed memory on this system */
  CudaDevAttrIsMultiGpuBoard DeviceAttribute = 84 /**< Device is on a multi-GPU board */
  CudaDevAttrMultiGpuBoardGroupID DeviceAttribute = 85 /**< Unique identifier for a group of devices on the same multi-GPU board */
  CudaDevAttrHostNativeAtomicSupported DeviceAttribute = 86 /**< Link between the device and the host supports native atomic operations */
  CudaDevAttrSingleToDoublePrecisionPerfRatio DeviceAttribute = 87 /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
  CudaDevAttrPageableMemoryAccess DeviceAttribute = 88 /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
  CudaDevAttrConcurrentManagedAccess DeviceAttribute = 89 /**< Device can coherently access managed memory concurrently with the CPU */
  CudaDevAttrComputePreemptionSupported DeviceAttribute = 90 /**< Device supports Compute Preemption */
  CudaDevAttrCanUseHostPointerForRegisteredMem DeviceAttribute = 91 /**< Device can access host registered memory at the same virtual address as the CPU */
  CudaDevAttrReserved92 DeviceAttribute = 92
  CudaDevAttrReserved93 DeviceAttribute = 93
  CudaDevAttrReserved94 DeviceAttribute = 94
  CudaDevAttrCooperativeLaunch DeviceAttribute = 95 /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel*/
  CudaDevAttrCooperativeMultiDeviceLaunch DeviceAttribute = 96 /**< Deprecated cudaLaunchCooperativeKernelMultiDevice is deprecated. */
  CudaDevAttrMaxSharedMemoryPerBlockOptin DeviceAttribute = 97 /**< The maximum opt-in shared memory per block. This value may vary by chip. See ::cudaFuncSetAttribute */
  CudaDevAttrCanFlushRemoteWrites DeviceAttribute = 98 /**< Device supports flushing of outstanding remote writes. */
  CudaDevAttrHostRegisterSupported DeviceAttribute = 99 /**< Device supports host memory registration via ::cudaHostRegister. */
  CudaDevAttrPageableMemoryAccessUsesHostPageTables DeviceAttribute = 100 /**< Device accesses pageable memory via the host's page tables. */
  CudaDevAttrDirectManagedMemAccessFromHost DeviceAttribute = 101 /**< Host can directly access managed memory on the device without migration. */
  CudaDevAttrMaxBlocksPerMultiprocessor DeviceAttribute = 106 /**< Maximum number of blocks per multiprocessor */
  CudaDevAttrMaxPersistingL2CacheSize DeviceAttribute = 108 /**< Maximum L2 persisting lines capacity setting in bytes. */
  CudaDevAttrMaxAccessPolicyWindowSize DeviceAttribute = 109 /**< Maximum value of cudaAccessPolicyWindow::num_bytes. */
  CudaDevAttrReservedSharedMemoryPerBlock DeviceAttribute = 111 /**< Shared memory reserved by CUDA driver per block in bytes */
  CudaDevAttrSparseCudaArraySupported DeviceAttribute = 112 /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
  CudaDevAttrHostRegisterReadOnlySupported DeviceAttribute = 113 /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */
  CudaDevAttrTimelineSemaphoreInteropSupported DeviceAttribute = 114 /**< External timeline semaphore interop is supported on the device */
  CudaDevAttrMaxTimelineSemaphoreInteropSupported DeviceAttribute = 114 /**< Deprecated External timeline semaphore interop is supported on the device */
  CudaDevAttrMemoryPoolsSupported DeviceAttribute = 115 /**< Device supports using the ::cudaMallocAsync and ::cudaMemPool family of APIs */
  CudaDevAttrGPUDirectRDMASupported DeviceAttribute = 116 /**< Device supports GPUDirect RDMA APIs like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
  CudaDevAttrGPUDirectRDMAFlushWritesOptions DeviceAttribute = 117 /**< The returned attribute shall be interpreted as a bitmask where the individual bits are listed in the ::cudaFlushGPUDirectRDMAWritesOptions enum */
  CudaDevAttrGPUDirectRDMAWritesOrdering DeviceAttribute = 118 /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::cudaGPUDirectRDMAWritesOrdering for the numerical values returned here. */
  CudaDevAttrMemoryPoolSupportedHandleTypes DeviceAttribute = 119 /**< Handle types supported with mempool based IPC */
  CudaDevAttrClusterLaunch DeviceAttribute = 120 /**< Indicates device supports cluster launch */
  CudaDevAttrDeferredMappingCudaArraySupported DeviceAttribute = 121 /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
  CudaDevAttrReserved122 DeviceAttribute = 122
  CudaDevAttrReserved123 DeviceAttribute = 123
  CudaDevAttrReserved124 DeviceAttribute = 124
  CudaDevAttrIpcEventSupport DeviceAttribute = 125 /**< Device supports IPC Events. */
  CudaDevAttrMemSyncDomainCount DeviceAttribute = 126 /**< Number of memory synchronization domains the device supports. */
  CudaDevAttrReserved127 DeviceAttribute = 127
  CudaDevAttrReserved128 DeviceAttribute = 128
  CudaDevAttrReserved129 DeviceAttribute = 129
  CudaDevAttrNumaConfig DeviceAttribute = 130 /**< NUMA configuration of a device: value is of type cudaDeviceNumaConfig enum */
  CudaDevAttrNumaId DeviceAttribute = 131 /**< NUMA node ID of the GPU memory */
  CudaDevAttrReserved132 DeviceAttribute = 132
  CudaDevAttrMpsEnabled DeviceAttribute = 133 /**< Contexts created on this device will be shared via MPS */
  CudaDevAttrHostNumaId DeviceAttribute = 134 /**< NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA. */
  CudaDevAttrMax DeviceAttribute = 135
)
@@ -74,6 +74,14 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
  return int(cCudaPointerAttributes.device)
}

func GetDeviceAttribute(attr DeviceAttribute, device int) int {
  var res int
  cRes := (*C.int)(unsafe.Pointer(&res))
  cDevice := (C.int)(device)
  C.cudaDeviceGetAttribute(cRes, attr, cDevice)
  return res
}

// RunOnDevice forces the provided function to run all GPU related calls within it
// on the same host thread and therefore the same GPU device.
//
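GetDeviceAttribute is what the new tests use to decide whether pinning is worthwhile on device 0. A hedged sketch of that gating logic pulled into a helper (the helper itself is illustrative and not part of the bindings; the interpretation of the two attributes follows the tests further down):

func canUsePinnedMemory(device int) bool {
  // Host registration must be supported, and the device must not already
  // access pageable host memory through the host's page tables, in which
  // case explicit pinning adds little benefit.
  pinnable := GetDeviceAttribute(CudaDevAttrHostRegisterSupported, device)
  lockable := GetDeviceAttribute(CudaDevAttrPageableMemoryAccessUsesHostPageTables, device)
  return pinnable == 1 && lockable == 0
}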
@@ -84,46 +92,46 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
//
// As an example:
//
// cr.RunOnDevice(i, func(args ...any) {
//   defer wg.Done()
//   cfg := GetDefaultMSMConfig()
//   stream, _ := cr.CreateStream()
//   for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
//     size := 1 << power
//
//     // This will always print "Inner goroutine device: 0"
//     // go func () {
//     //   device, _ := cr.GetDevice()
//     //   fmt.Println("Inner goroutine device: ", device)
//     // }()
//     // To force the above goroutine to same device as the wrapping function:
//     // RunOnDevice(i, func(arg ...any) {
//     //   device, _ := cr.GetDevice()
//     //   fmt.Println("Inner goroutine device: ", device)
//     // })
//
//     scalars := GenerateScalars(size)
//     points := GenerateAffinePoints(size)
//
//     var p Projective
//     var out core.DeviceSlice
//     _, e := out.MallocAsync(p.Size(), p.Size(), stream)
//     assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
//     cfg.Ctx.Stream = &stream
//     cfg.IsAsync = true
//
//     e = Msm(scalars, points, &cfg, out)
//     assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
//
//     outHost := make(core.HostSlice[Projective], 1)
//
//     cr.SynchronizeStream(&stream)
//     outHost.CopyFromDevice(&out)
//     out.Free()
//     // Check with gnark-crypto
//     assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
//   }
// }, i)
// cr.RunOnDevice(i, func(args ...any) {
//   defer wg.Done()
//   cfg := GetDefaultMSMConfig()
//   stream, _ := cr.CreateStream()
//   for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
//     size := 1 << power

//     // This will always print "Inner goroutine device: 0"
//     // go func () {
//     //   device, _ := cr.GetDevice()
//     //   fmt.Println("Inner goroutine device: ", device)
//     // }()
//     // To force the above goroutine to same device as the wrapping function:
//     // RunOnDevice(i, func(arg ...any) {
//     //   device, _ := cr.GetDevice()
//     //   fmt.Println("Inner goroutine device: ", device)
//     // })

//     scalars := GenerateScalars(size)
//     points := GenerateAffinePoints(size)

//     var p Projective
//     var out core.DeviceSlice
//     _, e := out.MallocAsync(p.Size(), p.Size(), stream)
//     assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
//     cfg.Ctx.Stream = &stream
//     cfg.IsAsync = true

//     e = Msm(scalars, points, &cfg, out)
//     assert.Equal(t, e, cr.CudaSuccess, "Msm failed")

//     outHost := make(core.HostSlice[Projective], 1)

//     cr.SynchronizeStream(&stream)
//     outHost.CopyFromDevice(&out)
//     out.Free()
//     // Check with gnark-crypto
//     assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
//   }
// }, i)
func RunOnDevice(deviceId int, funcToRun func(args ...any), args ...any) {
  go func(id int) {
    defer runtime.UnlockOSThread()
@@ -9,6 +9,7 @@ package cuda_runtime
import "C"

import (
  // "runtime"
  "unsafe"
)

@@ -58,6 +59,45 @@ func FreeAsync(devicePtr unsafe.Pointer, stream Stream) CudaError {
  return err
}

func AllocPinned(size int, flags AllocPinnedFlags) (unsafe.Pointer, CudaError) {
  cSize := (C.size_t)(size)
  var hostPtr unsafe.Pointer
  ret := C.cudaHostAlloc(&hostPtr, cSize, flags)
  err := (CudaError)(ret)
  if err != CudaSuccess {
    return nil, err
  }

  return hostPtr, CudaSuccess
}

func GetHostFlags(ptr unsafe.Pointer) (flag uint) {
  cFlag := (C.uint)(flag)
  C.cudaHostGetFlags(&cFlag, ptr)
  flag = uint(cFlag) // propagate the queried flags back to the named return value
  return
}

func FreeAllocPinned(hostPtr unsafe.Pointer) CudaError {
  return (CudaError)(C.cudaFreeHost(hostPtr))
}

func RegisterPinned(hostPtr unsafe.Pointer, size int, flags RegisterPinnedFlags) (unsafe.Pointer, CudaError) {
  cSize := (C.size_t)(size)
  // Mask off higher flag bits; only CudaHostRegisterDefault and CudaHostRegisterPortable are currently supported
  flags = flags & 3
  ret := C.cudaHostRegister(hostPtr, cSize, flags)
  err := (CudaError)(ret)
  if err != CudaSuccess {
    return nil, err
  }

  return hostPtr, CudaSuccess
}

func FreeRegisteredPinned(hostPtr unsafe.Pointer) CudaError {
  return (CudaError)(C.cudaHostUnregister(hostPtr))
}

func CopyFromDevice(hostDst, deviceSrc unsafe.Pointer, size uint) (unsafe.Pointer, CudaError) {
  cCount := (C.size_t)(size)
  ret := C.cudaMemcpy(hostDst, deviceSrc, cCount, uint32(CudaMemcpyDeviceToHost))
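GetHostFlags is not exercised by the tests below; a hedged sketch of how it could be used to confirm how a pointer was pinned (this assumes the flags value is propagated back to the caller, as in the adjusted version above):

ptr, err := AllocPinned(1024, CudaHostAllocPortable)
if err == CudaSuccess {
  flags := GetHostFlags(ptr)
  if flags&uint(CudaHostAllocPortable) != 0 {
    // the allocation is visible to all CUDA contexts
  }
  FreeAllocPinned(ptr)
}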
@@ -40,3 +40,27 @@ func TestCopyFromToHost(t *testing.T) {
  assert.Equal(t, CudaSuccess, err, "Couldn't copy to device due to %v", err)
  assert.Equal(t, someInts, someInts2, "Elements of host slices do not match. Copying from/to host failed")
}

func TestRegisterUnregisterPinned(t *testing.T) {
  data := []int{1, 2, 3, 4, 5, 7, 8, 9}
  dataUnsafe := unsafe.Pointer(&data[0])
  _, err := RegisterPinned(dataUnsafe, int(unsafe.Sizeof(data[0])*9), CudaHostRegisterDefault)
  assert.Equal(t, CudaSuccess, err)
  _, err = RegisterPinned(dataUnsafe, int(unsafe.Sizeof(data[0])*9), CudaHostRegisterDefault)
  assert.Equal(t, CudaErrorHostMemoryAlreadyRegistered, err)

  err = FreeRegisteredPinned(dataUnsafe)
  assert.Equal(t, CudaSuccess, err)
  err = FreeRegisteredPinned(dataUnsafe)
  assert.Equal(t, CudaErrorHostMemoryNotRegistered, err)
}

func TestAllocFreePinned(t *testing.T) {
  pinnedMemPointer, err := AllocPinned(int(unsafe.Sizeof(1)*9), CudaHostAllocDefault)
  assert.Equal(t, CudaSuccess, err)

  err = FreeAllocPinned(pinnedMemPointer)
  assert.Equal(t, CudaSuccess, err)
  err = FreeAllocPinned(pinnedMemPointer)
  assert.Equal(t, CudaErrorInvalidValue, err)
}
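The double-register and double-free cases above show that CUDA's error codes are surfaced unchanged. A caller that wants pinning to be idempotent can treat the "already registered" code as success; an illustrative helper (not part of this PR):

func pinIdempotent(ptr unsafe.Pointer, size int) CudaError {
  _, err := RegisterPinned(ptr, size, CudaHostRegisterDefault)
  if err == CudaErrorHostMemoryAlreadyRegistered {
    return CudaSuccess
  }
  return err
}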
@@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {

  }
}

func TestMSMG2PinnedHostMemory(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBls12_377.GenerateScalars(size)
    points := g2.G2GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[g2.G2Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p g2.G2Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[g2.G2Projective], 1)

    e = g2.G2Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{3} {
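Pinned host memory pays off most when combined with the asynchronous, stream-based flow already shown in the RunOnDevice doc comment, since the runtime can only overlap host-to-device copies with compute when the source buffer is page-locked. A hedged sketch that merges the two (names follow the async example and the test above; it is not itself part of this commit):

stream, _ := cr.CreateStream()
cfg := g2.G2GetDefaultMSMConfig()
cfg.Ctx.Stream = &stream
cfg.IsAsync = true

points.Pin(cr.CudaHostRegisterDefault) // a page-locked input lets the async copy overlap with compute

var p g2.G2Projective
var out core.DeviceSlice
out.MallocAsync(p.Size(), p.Size(), stream)
if e := g2.G2Msm(scalars, points, &cfg, out); e != cr.CudaSuccess {
  // handle error
}
cr.SynchronizeStream(&stream)

points.Unpin()
out.Free()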
@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBls12_377.GenerateScalars(size)
    points := icicleBls12_377.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleBls12_377.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleBls12_377.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleBls12_377.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := bls12_377.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {

  }
}

func TestMSMG2PinnedHostMemory(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBls12_381.GenerateScalars(size)
    points := g2.G2GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[g2.G2Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p g2.G2Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[g2.G2Projective], 1)

    e = g2.G2Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBls12_381.GenerateScalars(size)
    points := icicleBls12_381.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleBls12_381.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleBls12_381.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleBls12_381.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := bls12_381.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {

  }
}

func TestMSMG2PinnedHostMemory(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBn254.GenerateScalars(size)
    points := g2.G2GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[g2.G2Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p g2.G2Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[g2.G2Projective], 1)

    e = g2.G2Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBn254.GenerateScalars(size)
    points := icicleBn254.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleBn254.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleBn254.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleBn254.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := bn254.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -106,6 +106,58 @@ func TestMSMG2(t *testing.T) {

  }
}

func TestMSMG2PinnedHostMemory(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBw6_761.GenerateScalars(size)
    points := g2.G2GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[g2.G2Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p g2.G2Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[g2.G2Projective], 1)

    e = g2.G2Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
  cfg := g2.G2GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleBw6_761.GenerateScalars(size)
    points := icicleBw6_761.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleBw6_761.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleBw6_761.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleBw6_761.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{3} {
@@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := bw6_761.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -40,6 +40,54 @@ func TestMSM(t *testing.T) {
  }
}

func TestMSMPinnedHostMemory(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicleGrumpkin.GenerateScalars(size)
    points := icicleGrumpkin.GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[icicleGrumpkin.Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p icicleGrumpkin.Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[icicleGrumpkin.Projective], 1)

    e = msm.Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)

    if pinnableAndLockable {
      e = msm.Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)

    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}

func TestMSMBatch(t *testing.T) {
  cfg := msm.GetDefaultMSMConfig()
  for _, power := range []int{10, 16} {
@@ -61,11 +61,12 @@ func TestNttDeviceAsyncNoDomain(t *testing.T) {

func TestNttBatchNoDomain(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := babybear_extension.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -80,11 +80,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := babybear.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -185,6 +185,61 @@ func TestMSM{{.CurvePrefix}}(t *testing.T) {
  {{end}}
  }
}

func TestMSM{{.CurvePrefix}}PinnedHostMemory(t *testing.T) {
  cfg := {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}GetDefaultMSMConfig()
  for _, power := range []int{10} {
    size := 1 << power

    scalars := icicle{{capitalize .Curve}}.GenerateScalars(size)
    points := {{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}GenerateAffinePoints(size)

    pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
    lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

    pinnableAndLockable := pinnable == 1 && lockable == 0

    var pinnedPoints core.HostSlice[{{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Affine]
    if pinnableAndLockable {
      points.Pin(cr.CudaHostRegisterDefault)
      pinnedPoints, _ = points.AllocPinned(cr.CudaHostAllocDefault)
      assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
    }

    var p {{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective
    var out core.DeviceSlice
    _, e := out.Malloc(p.Size(), p.Size())
    assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
    outHost := make(core.HostSlice[{{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective], 1)

    e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}Msm(scalars, points, &cfg, out)
    assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

    outHost.CopyFromDevice(&out)
    {{if ne .GnarkImport "" -}}
    // Check with gnark-crypto
    assert.True(t, testAgainstGnarkCryptoMsm{{.CurvePrefix}}(scalars, points, outHost[0]))
    {{end}}

    if pinnableAndLockable {
      e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}Msm(scalars, pinnedPoints, &cfg, out)
      assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

      outHost.CopyFromDevice(&out)
      {{if ne .GnarkImport "" -}}
      // Check with gnark-crypto
      assert.True(t, testAgainstGnarkCryptoMsm{{.CurvePrefix}}(scalars, pinnedPoints, outHost[0]))
      {{end}}
    }

    out.Free()

    if pinnableAndLockable {
      points.Unpin()
      pinnedPoints.FreePinned()
    }
  }
}
{{if ne .GnarkImport "" -}}
func TestMSM{{if eq .CurvePrefix "G2"}}G2{{end}}GnarkCryptoTypes(t *testing.T) {
  cfg := {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}GetDefaultMSMConfig()
@@ -64,11 +64,12 @@ func TestNttDeviceAsyncNoDomain(t *testing.T) {

func TestNttBatchNoDomain(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := {{.FieldNoDomain}}.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -163,11 +163,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
  cfg := ntt.GetDefaultNttConfig()
  largestTestSize := 12
  largestBatchSize := 100
  scalars := {{.Field}}.GenerateScalars(1 << largestTestSize * largestBatchSize)

  for _, size := range []int{4, largestTestSize} {
    for _, batchSize := range []int{1, 16, largestBatchSize} {
    for _, batchSize := range []int{2, 16, largestBatchSize} {
      testSize := 1 << size
      totalSize := testSize * batchSize
@@ -53,6 +53,8 @@ fn main() {
  .allowlist_function("cudaGetDevice")
  .allowlist_function("cudaSetDevice")
  .allowlist_function("cudaGetDeviceCount")
  .allowlist_function("cudaDeviceGetAttribute")
  .rustified_enum("cudaDeviceAttr")
  // error handling
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
  .allowlist_function("cudaGetLastError")
@@ -79,6 +81,15 @@ fn main() {
  .allowlist_function("cudaDeviceGetDefaultMemPool")
  .allowlist_function("cudaMemGetInfo")
  .rustified_enum("cudaMemcpyKind")
  .allowlist_function("cudaHostAlloc")
  .allowlist_var("cudaHostAllocDefault")
  .allowlist_var("cudaHostAllocPortable")
  .allowlist_function("cudaFreeHost")
  .allowlist_function("cudaHostGetFlags")
  .allowlist_function("cudaHostRegister")
  .allowlist_var("cudaHostRegisterDefault")
  .allowlist_var("cudaHostRegisterPortable")
  .allowlist_function("cudaHostUnregister")
  // Stream Ordered Memory Allocator
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
  .allowlist_function("cudaFreeAsync")