new style device (#2530)

* cpu tests pass

* torch works

* works

* metal works

* fix ops_disk

* metal jit works

* fix openpilot

* llvm and clang work

* fix webgpu

* docs are really broken

* LRU works on metal

* delete comment

* revert name to ._buf. LRU only on Compiled

* changes

* allocator

* allocator, getting closer

* lru alloc

* LRUAllocator

* all pass

* metal

* cuda

* test examples

* linearizer

* test fixes

* fix custom + clean realize

* fix hip

* skip tests

* fix tests

* fix size=0

* fix MOCKHIP

* fix thneed

* copy better

* simple

* old style metal copy

* fix thneed

* np reshape

* give cuda a device
This commit is contained in:
George Hotz
2023-11-30 17:07:16 -08:00
committed by GitHub
parent e56511b59a
commit 2c363b5f0b
38 changed files with 572 additions and 1039 deletions

View File

@@ -85,7 +85,6 @@ def schedule_to_thneed(schedule, output_fn):
def thneed_test_onnx(onnx_data, output_fn):
import onnx
import pyopencl as cl
from tinygrad.runtime.ops_gpu import CL
import numpy as np
from extra.thneed import Thneed
onnx_model = onnx.load(io.BytesIO(onnx_data))
@@ -118,11 +117,11 @@ def thneed_test_onnx(onnx_data, output_fn):
# inputs
for k,v in nt.inputs.items():
cl.enqueue_copy(CL.cl_queue[0], v, new_np_inputs[k], is_blocking=True)
cl.enqueue_copy(Device["GPU"].queue, v, new_np_inputs[k], is_blocking=True)
nt.run()
new_thneed_out = np.empty((nt.outputs[0].size//4,), dtype=np.float32).reshape(new_torch_out.shape)
cl.enqueue_copy(CL.cl_queue[0], new_thneed_out, nt.outputs[0], is_blocking=True)
cl.enqueue_copy(Device["GPU"].queue, new_thneed_out, nt.outputs[0], is_blocking=True)
# compare torch to thneed
np.testing.assert_allclose(new_torch_out, new_thneed_out, atol=1e-4, rtol=1e-2)