mirror of https://github.com/tinygrad/tinygrad.git
nocopy (#764)
@@ -1,5 +1,5 @@
 #!/bin/bash -e
-clang sniff.cc -Werror -shared -fPIC -I../src/ROCT-Thunk-Interface/include -I../src/ROCm-Device-Libs/ockl/inc -o sniff.so
+clang sniff.cc -Werror -shared -fPIC -I../src/ROCT-Thunk-Interface/include -I../src/ROCm-Device-Libs/ockl/inc -o sniff.so -lstdc++
 #AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so /home/tiny/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld
 AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so /home/tiny/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld
 #AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 0 -d 1 -m 1
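The only change in this script is the trailing -lstdc++: sniff.cc is C++ built through the plain clang driver, which, unlike clang++, does not link the C++ standard library automatically, so the shared object has to request it explicitly.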
test/external/external_multi_gpu.py
@@ -26,18 +26,18 @@ if __name__ == "__main__":
     CL.synchronize()
 
   # cross copy. this is going through the CPU
-  with Timing("0 -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
+  with Timing("0 -> CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
     a1 = a0.to(f'{device}:1').realize()
     CL.synchronize()
-  with Timing("1 -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
+  with Timing("1 -> CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
     b0 = b1.to(f'{device}:0').realize()
     CL.synchronize()
 
   # sum
-  with Timing("0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
+  with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
     ab0 = (a0 + b0).realize()
     CL.synchronize()
-  with Timing("1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
+  with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
     ab1 = (a1 + b1).realize()
     CL.synchronize()
 
@@ -52,15 +52,19 @@ if __name__ == "__main__":
     abx1 = (b1 + a0).realize()
     CL.synchronize()
 
   # copy back
+  # NOTE: half of this slowness is caused by allocating memory on the CPU
   with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
     cc0 = ab0.numpy()
   with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
     cc1 = ab1.numpy()
 
   # same
   print("testing")
   np.testing.assert_allclose(cc0, cc1)
 
   # devices
   print(ab0)
   print(ab1)
   print(abx0)
   print(abx1)
 
   # same
   #print("testing")
   #np.testing.assert_allclose(ab0.numpy(), ab1.numpy())
   #np.testing.assert_allclose(ab0.numpy(), abx0.numpy())
   #np.testing.assert_allclose(ab0.numpy(), abx1.numpy())
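The renamed Timing labels just make explicit what each block measures: the cross-device copies are staged through the CPU, and each sum reads two buffers to produce one. For reference, a minimal sketch of the bandwidth-reporting pattern used throughout this file, assuming Timing comes from tinygrad.helpers and passes the elapsed time in nanoseconds to on_exit (bytes per nanosecond is numerically GB/sec, and sz float32 elements are sz*4 bytes):

    from tinygrad.helpers import Timing

    sz = 1024*1024  # example float32 element count, so sz*4 bytes move
    with Timing("copy: ", on_exit=lambda ns: f", {(sz*4)/ns:.2f} GB/sec"):
      pass  # the timed transfer plus a device synchronize would go here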
@@ -42,9 +42,9 @@ class ImageDType(DType):
 
 class LazyNumpyArray:
   def __init__(self, fxn, shape, dtype): self.fxn, self.shape, self.dtype = fxn, shape, dtype
-  def __call__(self) -> np.ndarray: return np.ascontiguousarray(self.fxn(self) if callable(self.fxn) else self.fxn).reshape(self.shape).astype(self.dtype)
+  def __call__(self) -> np.ndarray: return np.require(self.fxn(self) if callable(self.fxn) else self.fxn, dtype=self.dtype, requirements='C').reshape(self.shape)
   def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape, self.dtype)
-  def copy(self): return self if callable(self.fxn) else LazyNumpyArray(self.fxn.copy(), self.shape, self.dtype)
+  def copy(self): return self if callable(self.fxn) else LazyNumpyArray(self.fxn, self.shape, self.dtype)
   def astype(self, typ): return LazyNumpyArray(self.fxn, self.shape, typ)
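This LazyNumpyArray change is the core of the nocopy idea: the old __call__ always paid for a copy because ndarray.astype copies by default even when the dtype already matches, whereas np.require(..., dtype=..., requirements='C') returns the input array untouched when it is already C-contiguous with that dtype. The copy() change applies the same idea to the wrapper itself, reusing the backing array instead of duplicating it. A small standalone check of the underlying numpy behavior (plain numpy, not tinygrad code):

    import numpy as np

    x = np.arange(4, dtype=np.float32)
    y_old = np.ascontiguousarray(x).astype(np.float32)         # astype copies even for a matching dtype
    y_new = np.require(x, dtype=np.float32, requirements='C')  # already satisfied, so no copy
    print(np.shares_memory(x, y_old))  # False
    print(y_new is x)                  # True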
@@ -158,7 +158,7 @@ class LazyBuffer:
   # NOTE: we have to make a copy of the numpy array here in case the user changes it. expose this? LazyNumpyArray doesn't have this problem
   @staticmethod
   def fromCPU(x:LazyNumpyArray, device) -> LazyBuffer:
-    return create_lazybuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy()), dtypes.from_np(x.dtype))
+    return create_lazybuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x), dtypes.from_np(x.dtype))
 
   # create a constant with the shape and dtype of self
   def const_like(self, val) -> LazyBuffer:
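Dropping x.copy() means fromCPU keeps a reference to the caller's data instead of a private duplicate, which is exactly the trade-off the NOTE above describes: mutate the array afterwards and the lazy graph will see the new values. A plain numpy illustration of that aliasing (names are placeholders, not tinygrad internals):

    import numpy as np

    src = np.ones(3, dtype=np.float32)
    held = src      # a reference is kept, not a copy
    src += 1        # the caller mutates the array later
    print(held)     # [2. 2. 2.] -- the held data changed too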
@@ -169,7 +169,7 @@ class LazyBuffer:
   def toCPU(self):
     realized = self.cast(dtypes.from_np(self.dtype.np)).contiguous().realize().realized
     ret = cast(RawBuffer, realized).toCPU().reshape(self.shape)
-    return ret.copy()
+    return ret
 
   def cast(self:LazyBuffer, arg:DType) -> LazyBuffer: return elementwise_op(UnaryOps.CAST, self, arg=arg) if self.dtype != arg else self
   def unary_op(self:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, self)
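In the same spirit, toCPU now returns the reshaped result directly; since ndarray.reshape gives a view whenever it can, the caller may share memory with the realized buffer rather than owning a fresh copy. A minimal numpy sketch of that sharing (raw is a stand-in for whatever RawBuffer.toCPU() returns, not tinygrad code):

    import numpy as np

    raw = np.zeros(4, dtype=np.float32)  # stand-in for the realized buffer's CPU data
    ret = raw.reshape(2, 2)              # reshape returns a view, not a copy
    ret[0, 0] = 1.0
    print(raw[0])                        # 1.0 -- writes through the view reach the backing data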
@@ -27,7 +27,7 @@ def einsum_mulacc(einsum, get_strides, expand):
   return mulacc
 
 numpy_fxn_for_op: Dict[Op, Callable] = {**base_fxn_for_op, **{
-  UnaryOps.NOOP: np.ascontiguousarray, UnaryOps.EXP: np.exp, UnaryOps.LOG: np.log, UnaryOps.CAST: lambda x,y: x.astype(y.np),
+  UnaryOps.NOOP: lambda x: np.require(x, requirements='C'), UnaryOps.EXP: np.exp, UnaryOps.LOG: np.log, UnaryOps.CAST: lambda x,y: x.astype(y.np),
   BinaryOps.MAX: np.maximum, BinaryOps.CMPEQ: lambda x,y: (x==y).astype(np.float32),
   MovementOps.PERMUTE: lambda x, order: x.transpose(order), MovementOps.PAD: np.pad, MovementOps.EXPAND: np.broadcast_to,
   MovementOps.STRIDE: lambda x, arg: x[tuple(slice(None, None, i) for i in arg)],
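The NOOP swap mirrors the LazyNumpyArray change above. Both np.ascontiguousarray and np.require leave an already C-contiguous array alone; one observable difference (an observation, not necessarily the motivation here) is that np.ascontiguousarray always returns an array of at least one dimension, while np.require preserves a 0-d input's shape:

    import numpy as np

    x = np.array(3.0, dtype=np.float32)           # 0-d array
    print(np.ascontiguousarray(x).shape)          # (1,) -- promoted to 1-D
    print(np.require(x, requirements='C').shape)  # ()   -- shape preserved
    print(np.require(x, requirements='C') is x)   # True -- and no copy is made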