multi cl_queue (#762)

* multi cl_queue

* only platforms 1

* GPUs first, then CPUs

* put device on underlying buffer

* cl_queue array
This commit is contained in:
George Hotz
2023-05-03 12:15:28 -07:00
committed by GitHub
parent 7757f5fed2
commit 7ecf4dff68
4 changed files with 28 additions and 25 deletions

View File

@@ -10,16 +10,16 @@ prg = CLProgram("test", """__kernel void test(__global float *a, __global float
 int idx = get_global_id(0);
 a[idx] = b[idx] + c[idx];
 }""")
-prg.clprg(CL.cl_queue, [N,], None, a._cl, b._cl, c._cl)
+prg.clprg(CL.cl_queue[0], [N,], None, a._cl, b._cl, c._cl)
 t1 = time.monotonic_ns()
-e1 = prg.clprg(CL.cl_queue, [N,], None, a._cl, b._cl, c._cl)
-CL.cl_queue.finish() # type: ignore
+e1 = prg.clprg(CL.cl_queue[0], [N,], None, a._cl, b._cl, c._cl)
+CL.synchronize()
 t2 = time.monotonic_ns()
 time.sleep(3)
 t3 = time.monotonic_ns()
-e2 = prg.clprg(CL.cl_queue, [N,], None, a._cl, b._cl, c._cl)
-CL.cl_queue.finish() # type: ignore
+e2 = prg.clprg(CL.cl_queue[0], [N,], None, a._cl, b._cl, c._cl)
+CL.synchronize()
 t4 = time.monotonic_ns()
 print(e1.profile.queued)