mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
100 lines
3.8 KiB
Python
100 lines
3.8 KiB
Python
#!/usr/bin/env python3
|
|
# GMMU=0 MLX_PCI=0000:41:00.0 PYTHONPATH=. python3 extra/mlx_driver/loopback.py
|
|
import struct
|
|
from tinygrad.helpers import getenv, round_up
|
|
from tinygrad.device import Device, BufferSpec
|
|
from tinygrad.runtime.support.system import PCIDevice
|
|
from tinygrad.runtime.support.memory import AddrSpace
|
|
from tinygrad.runtime.ops_amd import AMDComputeQueue
|
|
from tinygrad.helpers import to_be32, to_be64
|
|
from extra.mlx_driver.mlxdev import MLXDev, MLXQP
|
|
|
|
BUF_SIZE = 0x1000
|
|
MLX_PCI = getenv("MLX_PCI", "0000:41:00.0")
|
|
MLX_IP = getenv("MLX_IP", "10.0.0.1")
|
|
|
|
def map_phys_to_gpu(gpu, paddr, size):
  """Map a physical address range into the GPU's virtual address space.

  `size` is rounded up to a multiple of 0x1000, a virtual range of that size
  (0x1000-aligned) is allocated from the GPU memory manager, and the range is
  mapped onto `paddr` in the system address space, snooped and uncached.

  Returns the GPU virtual address of the new mapping.
  """
  mapped_size = round_up(size, 0x1000)
  mm = gpu.iface.dev_impl.mm
  gpu_va = mm.alloc_vaddr(mapped_size, align=0x1000)
  # single physically-contiguous chunk: (paddr, mapped_size)
  mm.map_range(gpu_va, mapped_size, [(paddr, mapped_size)], aspace=AddrSpace.SYS, snooped=True, uncached=True)
  return gpu_va
|
|
|
|
# open the AMD GPU through tinygrad's device registry
print("[init] AMD GPU...")
gpu = Device["AMD"]

# open the MLX5 NIC at MLX_PCI and create one queue pair on it
print(f"[init] MLX5 at {MLX_PCI}")
mlx_pci_dev = PCIDevice("mlx5", MLX_PCI)
dev = MLXDev(mlx_pci_dev, ip=MLX_IP)
qp = MLXQP(dev)

# loopback: the QP is connected to its own QPN, using this device's MAC and GID
print(f"[init] loopback connect QP 0x{qp.qp_info['qpn']:x}")
own_qpn = qp.qp_info['qpn']
own_gid = int.from_bytes(dev.local_gid, 'big')
qp.connect(own_qpn, dev.mac, own_gid)
|
|
|
|
# source and destination buffers, BUF_SIZE each, both from the AMD GPU allocator (src first, then dst)
buf_src, buf_dst = (gpu.allocator.alloc(BUF_SIZE, BufferSpec(nolru=True)) for _ in range(2))

# address given to the NIC = first entry of the buffer's paddrs + base of the GPU's VRAM BAR
# NOTE(review): only paddrs[0] is used, so this presumably relies on each buffer being one contiguous chunk
vram_bar_base = gpu.iface.pci_dev.bar_info(gpu.iface.vram_bar)[0]
src_paddr, dst_paddr = (buf.meta.mapping.paddrs[0][0] + vram_bar_base for buf in (buf_src, buf_dst))
print(f"src paddr=0x{src_paddr:x} dst paddr=0x{dst_paddr:x}")
|
|
|
|
# src gets the message padded with zero bytes up to BUF_SIZE, dst gets all zeros
test_msg = b"Hello from loopback send/recv!"
src_image = bytearray(test_msg.ljust(BUF_SIZE, b'\x00'))
gpu.allocator._copyin(buf_src, memoryview(src_image))
dst_image = bytearray(BUF_SIZE)
gpu.allocator._copyin(buf_dst, memoryview(dst_image))
# wait for both copies before the NIC is pointed at the buffers
gpu.synchronize()
|
|
|
|
# post one recv WQE on the RQ from the CPU
log_rq_size = 4
rq_slot = qp.rq_head & ((1 << log_rq_size) - 1)
recv_wqe = qp.qp_buf.view(rq_slot * 16, 16)
# 16-byte big-endian scatter entry: byte_count, lkey, addr -> receive into dst
recv_entry = struct.pack('>IIQ', len(test_msg), dev.mkey, dst_paddr)
recv_wqe[:] = recv_entry
qp.rq_head += 1
# ring recv doorbell from CPU (DBR offset 0 = recv counter); dev.dbr is indexed in 4-byte words
dev.dbr[qp.qp_dbr // 4] = to_be32(qp.rq_head)
|
|
|
|
# build one 64-byte send WQE in the SQ from the CPU (opcode 0x0a = SEND, ds_count=2)
send_idx = qp.sq_head
send_slot = send_idx & ((1 << qp.log_sq_size) - 1)
send_wqe = qp.qp_buf.view(qp.sq_offset + send_slot * 64, 64)
send_wqe[:] = bytes(64)
# bytes 0..7: two big-endian words, (wqe index << 8 | opcode) and (qpn << 8 | ds_count)
ctrl_words = ((send_idx << 8) | 0x0a, (qp.qp_info['qpn'] << 8) | 2)
send_wqe[0:8] = struct.pack('>II', *ctrl_words)
send_wqe[11] = 0x08  # CE: signal completion
# bytes 16..31: gather entry (byte_count, lkey, addr) -> send from src
send_wqe[16:32] = struct.pack('>IIQ', len(test_msg), dev.mkey, src_paddr)
qp.sq_head += 1
# doorbell value: the first 8 bytes of the WQE, read back from the queue buffer
doorbell_val = to_be64(int.from_bytes(bytes(send_wqe[0:8]), 'big'))
|
|
|
|
# map the MLX5 UAR page and the first doorbell-record page into GPU VA, one 0x1000 page each
mlx_page_size = 0x1000
mlx_bar0_base = dev.pci_dev.bar_info(0)[0]
# UAR page number dev.uar, counted in 0x1000 pages from the start of BAR 0
uar_paddr = mlx_bar0_base + dev.uar * mlx_page_size
uar_gpu_va = map_phys_to_gpu(gpu, uar_paddr, mlx_page_size)
dbr_paddr = dev.dbr_paddrs[0]
dbr_gpu_va = map_phys_to_gpu(gpu, dbr_paddr, mlx_page_size)
print(f"UAR gpu_va=0x{uar_gpu_va:x} DBR gpu_va=0x{dbr_gpu_va:x}")
|
|
|
|
# the GPU rings the send doorbell: two release_mem writes on an AMD compute queue
q = AMDComputeQueue(gpu)
q.wait(gpu.timeline_signal, gpu.timeline_value - 1)
pm4 = q.pm4
# first the doorbell record (32-bit sq_head) - the send counter sits at qp_dbr + 4
sq_dbr_va = dbr_gpu_va + qp.qp_dbr + 4
q.release_mem(sq_dbr_va, to_be32(qp.sq_head), pm4.data_sel__mec_release_mem__send_32_bit_low,
              pm4.int_sel__mec_release_mem__none)
# then the UAR doorbell (64-bit) at offset 0x800 of the mapped UAR page
uar_db_va = uar_gpu_va + 0x800
q.release_mem(uar_db_va, doorbell_val, pm4.data_sel__mec_release_mem__send_64_bit_data,
              pm4.int_sel__mec_release_mem__none)
q.signal(gpu.timeline_signal, gpu.next_timeline())
q.submit(gpu)

print("GPU kicked doorbell, waiting...")
gpu.synchronize()
|
|
|
|
# poll the CQ from the CPU twice: one completion for the send, one for the recv
for _ in range(2):
  qp.poll_cq()
|
|
|
|
# read the destination buffer back to the host
result = bytearray(BUF_SIZE)
gpu.allocator._copyout(memoryview(result), buf_dst)
gpu.synchronize()

# only the first len(test_msg) bytes were received; the rest of dst stays zeroed
got = bytes(result[:len(test_msg)])
print(f"result: {got}")
# explicit raise instead of `assert`: asserts are stripped under `python -O`, which would
# let the script report success without ever comparing the data
if got != test_msg:
  raise AssertionError(f"MISMATCH: {got} != {test_msg}")
print("RDMA loopback send/recv test passed (GPU-kicked)")
|