native_explog is default

This commit is contained in:
George Hotz
2023-01-24 08:09:43 -08:00
parent 5d350d4883
commit 78fedc13d1

View File

@@ -12,7 +12,7 @@ from tinygrad.shape import ShapeTracker, View, ZeroView
from tinygrad.shape.symbolic import Variable, ModNode
VALIDHACKS = int(os.getenv("VALIDHACKS", "0")) # TODO: remove the need for this
NATIVE_EXPLOG = int(os.getenv("NATIVE_EXPLOG", 0)) # this is needed as a switch for the tests to pass
NATIVE_EXPLOG = int(os.getenv("NATIVE_EXPLOG", "1")) # this is needed as a switch for the tests to pass on QCOM
CLCACHE = int(os.getenv("CLCACHE", "1"))
FLOAT16 = int(os.getenv("FLOAT16", "0"))
@@ -50,22 +50,18 @@ class CL:
cl_ctx : Optional[cl.Context] = None
cl_queue : Optional[cl.CommandQueue] = None
def __init__(self):
if CL.cl_queue is not None: # already initted
return
if CL.cl_queue is not None: return # already initted
devices = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
if len(devices) == 0: # settle for CPU
devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], [])
CL.cl_ctx = cl.Context(devices=[devices[int(os.getenv("CL_DEVICE", "0"))]])
if len(devices) > 1 or DEBUG >= 1:
print(f"using {CL.cl_ctx.devices}")
if len(devices) > 1 or DEBUG >= 1: print(f"using {CL.cl_ctx.devices}")
CL.cl_queue = cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue
@staticmethod
def enqueue_copy(a, b, is_blocking=False):
if CL.CACHE is not None:
assert False, f"can't copy {a} -> {b} while caching"
if DEBUG >= 1:
print(f"**CL** copy in {b.shape}" if isinstance(b, np.ndarray) else f"**CL** copy OUT {a.shape}")
if CL.CACHE is not None: assert False, f"can't copy {a} -> {b} while caching"
if DEBUG >= 1: print(f"**CL** copy in {b.shape}" if isinstance(b, np.ndarray) else f"**CL** copy OUT {a.shape}")
cl.enqueue_copy(CL().cl_queue, a, b, is_blocking=is_blocking)
@functools.lru_cache(maxsize=None)
@@ -85,14 +81,10 @@ class CLProgram:
CLProgram.kernel_cnt[name] += 1
def __call__(self, *args):
CL.kernel_count += 1
if CL.CACHE is not None:
CL.CACHE.append((self, args))
else:
e = self.clprg(CL().cl_queue, *args)
if DEBUG >= 4:
print(self.prg)
if DEBUG >= 2:
CL.cl_queue.finish()
if CL.CACHE is not None: CL.CACHE.append((self, args))
else: e = self.clprg(CL().cl_queue, *args)
if DEBUG >= 4: print(self.prg)
if DEBUG >= 2: CL.cl_queue.finish()
if DEBUG >= 1:
CL.time_sum += 0 if DEBUG <= 1 or CL.CACHE is not None else (e.profile.end - e.profile.start)
CL.ops_sum += self.op_estimate
@@ -122,9 +114,6 @@ class CLASTKernel(ASTKernel):
}
start_for_op = {ReduceOps.SUM: "0.0", ReduceOps.MAX: "-INFINITY"}
def __init__(self, ast:LazyOp):
super().__init__(ast)
# TODO: move to shapetracker
def compute_buf_index_symbolic(self, st, buf_index, offset=0):
view = View(self.shapes[buf_index], self.strides[buf_index], self.offsets[buf_index] + offset)
@@ -388,6 +377,7 @@ class CLASTKernel(ASTKernel):
clbufs = [x.cl for i,x in enumerate(bufs) if i not in self.bufs_to_delete]
return self.fxn(self.output_shape[::-1] if len(self.output_shape) > 0 else [1], (self.group_for_reduce[::-1] + [1]*(len(self.output_shape)-len(self.group_for_reduce))) if self.group_for_reduce else None, *clbufs)
return runner
def print(self):
super().print()
for i in range(len(self.bufs)):