diff --git a/tinygrad/codegen/assembly.py b/tinygrad/codegen/assembly.py
index 666a297343..d8dac5ab9b 100644
--- a/tinygrad/codegen/assembly.py
+++ b/tinygrad/codegen/assembly.py
@@ -80,7 +80,7 @@ class AssemblyLanguage:
     off = 0 # TODO: should this be None?
     if isinstance(idx, SumNode):
       nums = [n.b for n in idx.nodes if isinstance(n, NumNode)]
-      if len(nums) > 0 and nums[0] < 4096 and (idx-nums[0]).min >= 0: # TODO: different for each GPU?
+      if nums and nums[0] < 4096 and (idx-nums[0]).min >= 0: # TODO: different for each GPU?
         idx -= nums[0]
         off = cast(int, nums[0])
     reg = idx.render(self.render_ops, self)
diff --git a/tinygrad/codegen/assembly_arm64.py b/tinygrad/codegen/assembly_arm64.py
index 9faa21cfdf..fe94d449bf 100644
--- a/tinygrad/codegen/assembly_arm64.py
+++ b/tinygrad/codegen/assembly_arm64.py
@@ -55,7 +55,7 @@ def specialize_to_arm64(fn_nm, asm):
     for v in [v for v in mvars if v is not None and v.__class__ is not int and v.nm not in rtor]:
       available_regs = s_regs if dtypes.is_float(v[1]) else x_regs
       #NOTE: Very simple spill, everything that don't fit in regs goes to mem
-      if len(available_regs) == 0:
+      if not available_regs:
         # ARM needs the stack 16-byte aligned
         var_size += 16
         available_regs.append('s0' if dtypes.is_float(out[1]) else 'x12')
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 09687ff8c8..ae3ad3ebb3 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -190,7 +190,7 @@ class Linearizer:

     # make the output buffer shape correct in here
     self.sts[0].reshape(self.info.shape)
-    self.full_buf_index: int = self.bufs.index(self.earlybufs[0]) if len(self.earlybufs) > 0 else 0
+    self.full_buf_index: int = self.bufs.index(self.earlybufs[0]) if self.earlybufs else 0

     # move all reduce axes to the end
     reduce = list(enumerate(zip(self.full_shape, self.sts[0].shape)))
@@ -314,7 +314,7 @@ class Linearizer:
       self.uop(UOps.DEFINE_GLOBAL, None, [], (var.expr, dtypes._arg_int32))

     # add a local buffer for multistage reduce
-    if len(self.group_for_reduce):
+    if self.group_for_reduce:
       # TODO: the strides of this can be controlled
       self.sts.append(ShapeTracker(tuple([1] * self.first_reduce + self.group_for_reduce + [1] * (self.shape_len - self.upcasted - len(self.group_for_reduce) - self.first_reduce) + [x[0] for x in self.upcasted_axis(0)])))
       self.bufs.append(LocalBuffer("temp", self.sts[-1].size()))
diff --git a/tinygrad/codegen/optimizer.py b/tinygrad/codegen/optimizer.py
index 218ed256ce..fa5ebb8738 100644
--- a/tinygrad/codegen/optimizer.py
+++ b/tinygrad/codegen/optimizer.py
@@ -49,7 +49,7 @@ def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], to_p
     opts.append(ng.p.TransitionChoice([(i,s,"L") for s in LOCALS if k.full_shape[i]%s == 0]))
   for i in range(k.shape_len-k.first_reduce):
     opts.append(ng.p.TransitionChoice([(i,s,"R") for s in UPCASTS if k.full_shape[k.first_reduce+i]%s == 0]))
-  if len(opts) == 0: return "BASELINE"
+  if not opts: return "BASELINE"
   search_space = prod([len(x.choices) for x in opts])
   st = time.perf_counter()
   optimizer = ng.optimizers.NGOpt(parametrization=ng.p.Tuple(*opts), budget=min(search_space, 200))
@@ -118,7 +118,7 @@ def hand_coded_optimizations(k:Linearizer):
     buf1_strides = k.sts[buf1].real_strides()
     axis_buf0 = [(i,k.full_shape[i],buf1_strides[i]) for i,s in enumerate(buf0_strides) if s == 0 and k.full_shape[i]%16 == 0 and i < k.first_reduce]
     axis_buf1 = [(i,k.full_shape[i],buf0_strides[i]) for i,s in enumerate(buf1_strides) if s == 0 and k.full_shape[i]%16 == 0 and i < k.first_reduce]
-    if len(axis_buf0) and len(axis_buf1) and k.full_shape[k.first_reduce]%8 == 0 and (k.shape_len-k.first_reduce) == 1:
+    if axis_buf0 and axis_buf1 and k.full_shape[k.first_reduce]%8 == 0 and (k.shape_len-k.first_reduce) == 1:
       if DEBUG >= 3: print("HIP TENSOR CORES", axis_buf0, axis_buf1)
       k.use_tensor_cores = getenv("TC", 1) == 1 # TC=2 will do the shape ops without the WMMA
       k.reverse_upcast_dir = True
@@ -177,7 +177,7 @@ def hand_coded_optimizations(k:Linearizer):
     buf1_strides = k.sts[buf1].real_strides()
     axis_buf0 = [(i,k.full_shape[i],buf1_strides[i]) for i,s in enumerate(buf0_strides) if s == 0 and k.full_shape[i]%8 == 0 and i < k.first_reduce]
     axis_buf1 = [(i,k.full_shape[i],buf0_strides[i]) for i,s in enumerate(buf1_strides) if s == 0 and k.full_shape[i]%8 == 0 and i < k.first_reduce]
-    if len(axis_buf0) and len(axis_buf1) and k.full_shape[k.first_reduce]%8 == 0 and (k.shape_len-k.first_reduce) == 1:
+    if axis_buf0 and axis_buf1 and k.full_shape[k.first_reduce]%8 == 0 and (k.shape_len-k.first_reduce) == 1:
       if DEBUG >= 3: print("METAL TENSOR CORES", axis_buf0, axis_buf1)
       k.use_tensor_cores = getenv("TC", 1) == 1 # TC=2 will do the shape ops without the WMMA

@@ -267,7 +267,7 @@ def hand_coded_optimizations(k:Linearizer):
       # if we haven't upcasted it, it's not symbolic, it mods, and some buffer has stride 0 on axis while having no stride 0 in the upcasted axis already
       if axis not in upcasted_axis and isinstance(k.full_shape[axis], int) and k.full_shape[axis]%upcast_amount == 0 and any(k.sts[buf_index].views[-1].strides[axis] == 0 and not any(x[1] == 0 for x in k.upcasted_axis(buf_index)) for buf_index in range(len(k.sts))):
         xb_choices.append((sum(st.views[-1].strides[axis]>0 for st in k.sts), sum(st.views[-1].strides[axis] for st in k.sts), axis, upcast_amount))
-    if len(xb_choices):
+    if xb_choices:
       xb_choices = sorted(xb_choices)
       if DEBUG >= 4: print(f"float4 merging axis : {xb_choices}")
       k.shift_to(xb_choices[0][2], amount=xb_choices[0][3])
@@ -293,7 +293,7 @@ def hand_coded_optimizations(k:Linearizer):
   # if nothing at all is upcasted and it's easy to, do an upcast
   # TODO: this is breaking the tests
   for splits in [4]:
-    if k.upcasted == 0 and len(k.full_unupcasted_shape) > 0 and k.full_unupcasted_shape[-1] % splits == 0:
+    if k.upcasted == 0 and k.full_unupcasted_shape and k.full_unupcasted_shape[-1] % splits == 0:
      k.shift_to(len(k.full_unupcasted_shape)-1, splits, insert_before=len(k.full_unupcasted_shape))
      k.upcast()

diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index ab3f6213d6..68c25b64c4 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -69,7 +69,7 @@ def _ast_binaryops(self:LazyBuffer) -> LazyOp:
   # TODO: this can also support late fusion of BinaryOps, required for test_fold_conv_sgd
   psrcs: List[Tuple[LazyBuffer, LazyBuffer]] = [(k,x) for k,x in zip(real_srcs.keys(), map(get_movementroot_contiguous, real_srcs.keys())) if x.optype == ReduceOps and not x.realized and prod(k.shape) == prod(x.shape) and len(x.children) <= 1 and len(k.children) <= 1]
   intermediate_shape: Tuple[int, ...] = self.shape
-  if MERGE_ONE_REDUCE_INTO_ELEMENTWISE and len(psrcs) >= 1:
+  if MERGE_ONE_REDUCE_INTO_ELEMENTWISE and psrcs:
     psrc = psrcs[0] # NOTE: right now we can't handle multiple, as we'd have to check for loop
     if psrc[1].optype == ReduceOps:
       top = _ast_reduceops(psrc[1])
@@ -214,7 +214,7 @@ class LazyBuffer:
     return create_lazybuffer(self.device, ShapeTracker(self.shape), LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,), None), self.dtype)

   def shuffle_and_prune_movement_ops(self, st: ShapeTracker, op: MovementOps, arg: Union[Tuple[Union[Node,int], ...], Tuple[Tuple[int, int], ...]]) -> LazyBuffer:
-    if SHUFFLE_MOVEMENT_OPS and self.optype == BinaryOps and not self.realized and (op in {MovementOps.SHRINK, MovementOps.STRIDE, MovementOps.PERMUTE} or (op == MovementOps.RESHAPE and self.op.op in UnaryOps)) and len(self.children) == 0:
+    if SHUFFLE_MOVEMENT_OPS and self.optype == BinaryOps and not self.realized and (op in {MovementOps.SHRINK, MovementOps.STRIDE, MovementOps.PERMUTE} or (op == MovementOps.RESHAPE and self.op.op in UnaryOps)) and not self.children:
       return self.op.replace_with_movement_ops([(op, arg)])
     ret = create_lazybuffer(self.device, st, MovementOps, LazyOp(op, (self,), arg), self.dtype)
     if REMOVE_MOVEMENT_NOPS and not self.realized and not ret.realized and ret.st.contiguous:
@@ -335,7 +335,7 @@ def elementwise_op(op:Union[UnaryOps, BinaryOps, TernaryOps], *srcs:LazyBuffer,

   if MERGE_ELEMENTWISE_OPS:
     # remove the buffers from any (childless) BinaryOps that feed into this
-    srcs = tuple([x.op if x.optype == BinaryOps and len(x.children) == 0 and not x.realized else x for x in srcs]) # type: ignore
+    srcs = tuple([x.op if x.optype == BinaryOps and not x.children and not x.realized else x for x in srcs]) # type: ignore

   return create_lazybuffer(out_device, ShapeTracker(out_shape), BinaryOps, LazyOp(op, srcs, arg), out_dtype)

diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index a0a7499b27..f7ba3338dd 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -136,7 +136,7 @@ def uops_to_cstyle(lang:CStyleLanguage, function_name:str, uops:List[UOp]) -> T
     elif uop == UOps.BARRIER:
       kk(lang.barrier)
     elif uop == UOps.ENDLOOP:
-      if args[1] == "local" and len(lang.lid):
+      if args[1] == "local" and lang.lid:
         # TODO: this is a bit of a hack. the local loop isn't real on the GPU
         kk(f"if ({Variable.sum(args[0]).render(render_cl)} == 0) {{")
         pend_close = "}"*(len(args[0])+1) + f" /* {args[1]} */"
diff --git a/tinygrad/renderer/wgsl.py b/tinygrad/renderer/wgsl.py
index 84a2261d1c..c5d7380bf6 100644
--- a/tinygrad/renderer/wgsl.py
+++ b/tinygrad/renderer/wgsl.py
@@ -31,12 +31,12 @@ class WGSLLanguage(CStyleLanguage):
     return self.render_cast([val]*var_dtype.sz, var_dtype) if var_dtype.sz > 1 else val

   def render_kernel(self, function_name:str, kernel:List[str], bufs:List[Tuple[str,DType]], global_size:List[int], local_size:List[int], prekernel:List[str]) -> Tuple[str, List[int], List[int]]:
-    local_size = local_size[::-1] if len(local_size) else [1]
+    local_size = local_size[::-1] if local_size else [1]
     bind_it = iter(range(len(bufs)))
     prg = "fn nan() -> f32 { let bits = 0xffffffffu; return bitcast<f32>(bits); }\n"
     prg += "\n".join(prekernel+[f"@group(0) @binding({next(bind_it)}) var<storage,read_write> {name}: array<{type_map[dtype]}>;" for name,dtype in bufs])
     prg += f"\n@compute @workgroup_size({','.join([str(x) for x in local_size])}) fn {function_name}(@builtin(workgroup_id) gindex: vec3<u32>, @builtin(local_invocation_id) lindex: vec3<u32>) {{\n" + "\n".join(kernel) + "\n}"
-    return prg, global_size[::-1] if len(global_size) else [1], local_size
+    return prg, global_size[::-1] if global_size else [1], local_size

   def render_for(self, expr:str, _min:int, _max:Union[int,str]) -> str:
     return f"for(var {expr} = {_min}; {expr} <= {_max}; {expr}++) {{"
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 9182d6f735..6cfa532f68 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -32,7 +32,7 @@ class CLAllocator(LRUAllocator):
 class _CL:
   def __init__(self):
     cl_platforms = cl.get_platforms()
-    platform_devices: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl_platforms] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl_platforms]) if len(y)]
+    platform_devices: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl_platforms] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl_platforms]) if y]
     self.devices = [device for device in platform_devices[getenv('CL_PLATFORM', 0)] if device.name not in getenv('CL_EXCLUDE', "").split(",")]
     self.cl_platform = self.devices[0].platform
   def post_init(self, device=None):
diff --git a/tinygrad/shape/shapetracker.py b/tinygrad/shape/shapetracker.py
index eba8f84511..dfd66dc941 100644
--- a/tinygrad/shape/shapetracker.py
+++ b/tinygrad/shape/shapetracker.py
@@ -8,7 +8,7 @@ from tinygrad.shape.symbolic import Variable, MulNode, NumNode, Node, SumNode, i
 @functools.lru_cache(maxsize=None)
 def to_shape_strides(shape:Tuple[int, ...], strides:Tuple[int, ...]) -> Tuple[Tuple[int, int], ...]:
   assert len(shape) == len(strides)
-  ret = [(shape[0], strides[0])] if len(shape) > 0 else []
+  ret = [(shape[0], strides[0])] if shape else []
   for i in range(1, len(shape)):
     if ret[-1][1] == shape[i]*strides[i] or ret[-1][0] == 1:
       ret[-1] = (ret[-1][0] * shape[i], strides[i])
diff --git a/tinygrad/shape/symbolic.py b/tinygrad/shape/symbolic.py
index d42e70611d..511990be83 100644
--- a/tinygrad/shape/symbolic.py
+++ b/tinygrad/shape/symbolic.py
@@ -42,7 +42,7 @@ class Node:
     lhs = self
     if isinstance(lhs, SumNode) and isinstance(b, int):
       muls, others = partition(lhs.nodes, lambda x: isinstance(x, MulNode) and x.b > 0 and x.max >= b)
-      if len(muls):
+      if muls:
         # NOTE: gcd in python 3.8 takes exactly 2 args
         mul_gcd = muls[0].b
         for x in muls[1:]: mul_gcd = gcd(mul_gcd, x.b)
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 5e3824eaf1..ae77ebc1f9 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -282,7 +282,7 @@ class Tensor:
       raise IndexError(f"too many indices for tensor of dimension {len(self.shape)}")
     ellipses_found = [i for i, v in enumerate(orig_slices) if v is Ellipsis]
     if len(ellipses_found) > 1: raise IndexError("an index can only have a single ellipsis ('...')")
-    ellipsis_idx = len(orig_slices) if len(ellipses_found) == 0 else ellipses_found[0]
+    ellipsis_idx = ellipses_found[0] if ellipses_found else len(orig_slices)
     orig_slices[ellipsis_idx:ellipsis_idx+1] = [slice(None)] * (len(self.shape) - num_slices)

     tensor_found = [(i,v) for i, v in enumerate(orig_slices) if isinstance(v, Tensor)]
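
A note on the pattern applied throughout this diff: every change replaces an explicit length check on a built-in container with the container's own truthiness, since empty lists, tuples, dicts, sets, and strings are falsy in Python, so behavior is unchanged for those types. Below is a minimal sketch of the idiom and its one real caveat; the variable names are illustrative, not taken from tinygrad.

# Empty built-in containers are falsy, so these pairs are equivalent:
nums = [3, 1, 2]
empty: list = []
assert bool(nums) == (len(nums) > 0)     # "if nums:" replaces "if len(nums) > 0:"
assert (not empty) == (len(empty) == 0)  # "if not empty:" replaces "if len(empty) == 0:"

# Caveat: truthiness is not emptiness for every type. Objects can override
# __bool__, and a numpy array with more than one element raises on bool(), e.g.:
#   import numpy as np
#   if np.array([0, 1]): ...  # ValueError: truth value is ambiguous
# The rewrite is only safe for plain containers like the ones touched above.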