diff --git a/tinygrad/runtime/ops_triton.py b/tinygrad/runtime/ops_triton.py
index 495c6f9600..e70794a0d6 100644
--- a/tinygrad/runtime/ops_triton.py
+++ b/tinygrad/runtime/ops_triton.py
@@ -19,36 +19,6 @@ from tinygrad.runtime.ops_gpu import CLBuffer
   fn = f"/tmp/{hash}.py"
   exec(codeObject, globals())
 
-class TritonBuffer():
-  def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[TritonBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False):
-    super().__init__(shape, hostbuf)
-    self._buf : Optional[TritonDeviceAllocation] = hostbuf._buf if hostbuf is not None else None
-    self._base_shape : Tuple[int, ...] = hostbuf._base_shape if hostbuf is not None else self.shape
-    self._backing : Optional[np.ndarray] = hostbuf._backing if hostbuf is not None else backing
-    if force_create: self.cuda
-
-  @property
-  def cuda(self):
-    if self._buf is None:
-      self._buf = TritonDeviceAllocation(4*prod(self._base_shape))
-      if self._backing is not None: self._buf.copyin(self._backing, stream)
-    return self._buf
-
-  @staticmethod
-  def fromCPU(x): return TritonBuffer(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel())
-
-  def toCPU(self):
-    data = np.empty(self.shape, dtype=np.float32)
-    buf = self.contiguous()
-    buf.cuda
-    buf._buf.copyout(data)
-    return data
-
-  @classmethod
-  def exec_ast(cls, ast:LazyOp, output_buffer:Optional[TritonBuffer]=None):
-    k = TritonASTKernel(ast, output_buffer)
-    k.codegen()(*k.bufs)
-    return k.ret
 
 class TritonDeviceAllocation(CLBuffer):
   def __init__(self, size):