diff --git a/examples/test_onnx_imagenet.py b/examples/test_onnx_imagenet.py index 20a681d258..af125f84d2 100644 --- a/examples/test_onnx_imagenet.py +++ b/examples/test_onnx_imagenet.py @@ -58,7 +58,7 @@ if __name__ == "__main__": return None return {"input": img.numpy()} quantize_static(model_fp32, fn, ImagenetReader(), quant_format=QuantFormat.QDQ, per_channel=False, - activation_type=QuantType.QUInt8, weight_type=QuantType.QInt8, + activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, extra_options={"ActivationSymmetric": False}) run_onnx_jit, input_specs = load_onnx_model(fetch(fn)) @@ -74,5 +74,8 @@ if __name__ == "__main__": hit += y==t print(f"target: {y:3d} pred: {t:3d} acc: {hit/(i+1)*100:.2f}%") + MS_TARGET = 13.4 + print(f"need {GlobalCounters.global_ops/1e9*(1000/MS_TARGET):.2f} GFLOPS for {MS_TARGET:.2f} ms") + import pickle with open("/tmp/im.pkl", "wb") as f: pickle.dump(run_onnx_jit, f) diff --git a/extra/onnx.py b/extra/onnx.py index dc723ddfe1..d340b00bf4 100644 --- a/extra/onnx.py +++ b/extra/onnx.py @@ -744,6 +744,10 @@ def get_onnx_ops(): return y, scale, zero_point def DequantizeLinear(x:Tensor, x_scale:Tensor, x_zero_point:Tensor|int=0, axis:int=1, block_size:int=0): + WEIGHT_SHIFT = 4 + if getenv("NHWC") and len(x.shape) == 4 and x.shape[2:] == (1,1) and x.shape[1]%WEIGHT_SHIFT == 0: + # DSP swizzle memory + x = x.reshape(x.shape[0], x.shape[1]//WEIGHT_SHIFT, WEIGHT_SHIFT).permute(1,0,2).contiguous().permute(1,0,2).reshape(x.shape) x_scale, x_zero_point = _prepare_quantize(x, x_scale, x_zero_point, axis, block_size) return ((x.int() - x_zero_point) * x_scale).cast(x_scale.dtype) diff --git a/extra/replay_pkl.py b/extra/replay_pkl.py index e550585f87..abb8a1a33b 100644 --- a/extra/replay_pkl.py +++ b/extra/replay_pkl.py @@ -2,7 +2,7 @@ import pickle, sys from dataclasses import replace from tinygrad import Device, Context from tinygrad.device import Buffer -from tinygrad.helpers import getenv +from tinygrad.helpers import getenv, BEAM from tinygrad.engine.jit import TinyJit from tinygrad.engine.realize import CompiledRunner from tinygrad.renderer import ProgramSpec @@ -22,7 +22,11 @@ if __name__ == "__main__": if knum == (pknum:=getenv("KNUM", 0)) or pknum == 0: p: ProgramSpec = ei.prg.p k = Kernel(p.ast, Device["DSP"].renderer) - if not getenv("NOOPT"): + dsp_bufs = [Buffer("DSP", 1024+b.size*2, b.dtype).view(b.size, b.dtype, 512) for b in ei.bufs] + if BEAM: + from tinygrad.engine.search import beam_search + k = beam_search(k, dsp_bufs, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1))) + elif not getenv("NOOPT"): # only NCHW """ if knum in [6,7,9,11]: @@ -48,19 +52,102 @@ if __name__ == "__main__": elif knum == 3: k.apply_opt(Opt(op=OptOps.UNROLL, axis=0, arg=4)) k.apply_opt(Opt(OptOps.UPCAST, 1, 128)) + elif knum == 29: + #k.apply_opt(Opt(OptOps.UPCAST, 1, 2)) + k.apply_opt(Opt(OptOps.PADTO, 1, 128)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 256)) + #k.apply_opt(Opt(OptOps.UNROLL, 0, 4)) else: k.hand_coded_optimizations() """ + """ if knum == 3: + # 12544x32 * 32x16 -> 12544x16 + k.apply_opt(Opt(OptOps.UNROLL, 0, 0)) k.apply_opt(Opt(OptOps.UPCAST, 1, 16)) k.apply_opt(Opt(OptOps.UPCAST, 0, 128//16)) + #k.apply_opt(Opt(OptOps.UPCAST, 0, 256//16)) #k.apply_opt(Opt(OptOps.UPCAST, 0, 8)) pass + elif knum == 6: + k.apply_opt(Opt(OptOps.UNROLL, 0, 8)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 0)) + elif knum == 4: + # 12544x16 * 16x96 -> 12544x96 + # (with the biased add) + #k.apply_opt(Opt(OptOps.UPCAST, 1, 96)) + #k.apply_opt(Opt(OptOps.UPCAST, 0, 4)) + #k.apply_opt(Opt(OptOps.UNROLL, 0, 0)) + #k.apply_opt(Opt(OptOps.PADTO, 0, 3)) + pass + elif knum == 13: + # 784x144 * 144x32 -> 784x32 + #k.apply_opt(Opt(OptOps.UNROLL, 0, 4)) + #k.apply_opt(Opt(OptOps.UNROLL, 0, 2)) + #k.apply_opt(Opt(OptOps.UPCAST, 0, 4)) + #k.apply_opt(Opt(OptOps.UPCAST, 0, 2)) + #k.apply_opt(Opt(OptOps.UPCAST, 1, 32)) + pass + elif knum == 20: + # 784x192 * 192x32 -> 784x32 + k.apply_opt(Opt(OptOps.UNROLL, 0, 8)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 32)) + k.apply_opt(Opt(OptOps.UPCAST, 0, 4)) + elif knum == 35: + k.apply_opt(Opt(OptOps.UNROLL, 0, 128)) + k.apply_opt(Opt(OptOps.UPCAST, 0, 2)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 64)) + elif knum == 37: + pass + elif knum == 24: + #k.apply_opt(Opt(OptOps.UNROLL, 0, 0)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 64)) + #k.apply_opt(Opt(OptOps.UPCAST, 0, 2)) + """ + #if knum in [7, 11, 14, 18]: + # alignment issue? + #pass + if knum == 4: + k.apply_opt(Opt(OptOps.UNROLL, 0, 4)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 96)) + k.apply_opt(Opt(OptOps.UPCAST, 0, 4)) + elif knum == 6: + k.apply_opt(Opt(OptOps.UNROLL, 0, 4)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 24)) + k.apply_opt(Opt(OptOps.UPCAST, 0, 16)) + elif knum == 11: + k.apply_opt(Opt(OptOps.UNROLL, 0, 4)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 144)) + #k.apply_opt(Opt(OptOps.UPCAST, 0, 8)) + elif knum == 14: + k.apply_opt(Opt(OptOps.UNROLL, 0, 4)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 192)) + k.apply_opt(Opt(OptOps.UPCAST, 0, 2)) + elif knum == 37: + k.apply_opt(Opt(OptOps.UNROLL, 0, 4)) + k.apply_opt(Opt(OptOps.UPCAST, 1, 384)) else: - k.hand_coded_optimizations() + full_shape = k.full_shape + out_shape = k.sts[0].shape + out_strides = k.sts[0].real_strides() + if len(out_strides) == 3: + if full_shape[1] < 128: + if full_shape[2] <= 32: k.apply_opt(Opt(OptOps.UNROLL, 0, 0)) + else: k.apply_opt(Opt(OptOps.UNROLL, 0, 8)) + k.apply_opt(Opt(OptOps.UPCAST, 1, full_shape[1])) + if out_strides[0] < 128: + upcast_0 = 128//out_strides[0] + if out_shape[0]%upcast_0 == 0 and upcast_0 != 1: k.apply_opt(Opt(OptOps.UPCAST, 0, upcast_0)) + elif full_shape[1] % 128 == 0: + k.apply_opt(Opt(OptOps.UPCAST, 1, 128)) + elif len(out_strides) == 1: + #if full_shape[0]%128 == 0: k.apply_opt(Opt(OptOps.UPCAST, 0, 128)) + pass + #print("here", out_shape, out_strides, k.name) + #k.hand_coded_optimizations() #if knum in [5]: k.apply_opt(Opt(OptOps.UPCAST, 1, 2)) p2 = k.to_program() - new_ei = replace(ei, prg=CompiledRunner(p2), bufs=[Buffer("DSP", 1024+b.size*2, b.dtype).view(b.size, b.dtype, 512) for b in ei.bufs]) + new_ei = replace(ei, prg=CompiledRunner(p2), bufs=dsp_bufs) new_ei.run() knum += 1 diff --git a/test/test_quantize_onnx.py b/test/test_quantize_onnx.py index e1bce65950..8e062a11af 100644 --- a/test/test_quantize_onnx.py +++ b/test/test_quantize_onnx.py @@ -1,10 +1,13 @@ +# ruff: noqa: E501 import numpy as np import unittest from dataclasses import replace from tinygrad import Tensor, Context, Device, dtypes -from tinygrad.ops import Ops +from tinygrad.ops import Ops, UOp # noqa: F401 # pylint: disable=unused-import from tinygrad.codegen.kernel import Kernel, Opt, OptOps from tinygrad.engine.realize import CompiledRunner, ExecItem, lower_schedule_item +from tinygrad.engine.search import bufs_from_lin +from tinygrad.shape.shapetracker import ShapeTracker, View # noqa: F401 # pylint: disable=unused-import N = 512 @@ -40,7 +43,7 @@ def sexec(out:Tensor, opts:list[Opt], replace_src=None, run_count=3): for opt in opts: k.apply_opt(opt) prg = k.to_program() if replace_src is not None: - old_name = prg.src.split("inscount();\n")[1].split("(")[0] + old_name = prg.src.split("__attribute__((noinline)) void ")[1].split("(")[0] prg = replace(prg, src=replace_src + "/* DSP boilerplate */" + prg.src.split("/* DSP boilerplate */")[1].replace(old_name, "fxn")) ei = ExecItem(CompiledRunner(prg), [x.ensure_allocated() for x in si.bufs], si.metadata) for _ in range(run_count): ei.run(wait=True) @@ -195,9 +198,12 @@ class TestQuantizeOnnx(unittest.TestCase): }""" self.test_prequant_gemm_intacc(np.uint8, np.int8, src) + def test_prequant_gemm_intacc_32(self): + opts = [Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=0)] + self.test_prequant_gemm_intacc(np.uint8, np.int8, N=32, opts=opts) def test_prequant_gemm_intacc_128(self): self.test_prequant_gemm_intacc(np.uint8, np.int8, N=128) def test_prequant_gemm_intacc_256(self): self.test_prequant_gemm_intacc(np.uint8, np.int8, N=256) - def test_prequant_gemm_intacc(self, xi=np.uint8, wi=np.uint8, replace_src=None, N=512, clip=True): + def test_prequant_gemm_intacc(self, xi=np.uint8, wi=np.uint8, replace_src=None, N=512, clip=True, opts=None): X = Tensor(m1:=(np.random.uniform(0, 255, size=(N,N)).astype(xi))).realize() W = Tensor(m2:=(np.random.uniform(0, 255, size=(N,N)).astype(wi))).realize() # ugh, it's so broken with those casts. need DONT_REALIZE_EXPAND=1 python3 test/test_quantize_onnx.py TestQuantizeOnnx.test_prequant @@ -206,7 +212,7 @@ class TestQuantizeOnnx(unittest.TestCase): out = (X.int().matmul(W.int())//1000) if clip: out = out.clip(dtypes.min(tg_dtype),dtypes.max(tg_dtype)) out = out.cast(tg_dtype) - opts = [Opt(op=OptOps.UPCAST, axis=1, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)] + opts = [Opt(op=OptOps.UPCAST, axis=1, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)] if opts is None else opts sexec(out, opts, replace_src, run_count=1) tout = out.numpy() mout = ((m1.astype(np.int32) @ m2.astype(np.int32)) / 1000) @@ -231,5 +237,132 @@ class TestQuantizeOnnx(unittest.TestCase): opts = [Opt(op=OptOps.UPCAST, axis=0, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)] sexec(out, opts) +@unittest.skipIf(Device.DEFAULT != "DSP", "only tests for DSP") +class TestDSPCache(unittest.TestCase): + def test_cache_speed(self): + # string becuase this breaks Python language server for syntax highlight for some reason + ast = eval("""UOp(Ops.SINK, dtypes.void, arg=None, src=( + UOp(Ops.STORE, dtypes.void, arg=None, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(25088), arg=0, src=()), + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 896, 32, 1, 0), offset=0, mask=None, contiguous=True),)), src=()), + UOp(Ops.CAST, dtypes.uchar, arg=None, src=( + UOp(Ops.XOR, dtypes.int, arg=None, src=( + UOp(Ops.MAX, dtypes.int, arg=None, src=( + UOp(Ops.XOR, dtypes.int, arg=None, src=( + UOp(Ops.MAX, dtypes.int, arg=None, src=( + UOp(Ops.CAST, dtypes.int, arg=None, src=( + UOp(Ops.ADD, dtypes.float, arg=None, src=( + UOp(Ops.ADD, dtypes.float, arg=None, src=( + UOp(Ops.MUL, dtypes.float, arg=None, src=( + UOp(Ops.ADD, dtypes.float, arg=None, src=( + UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (4,)), src=( + UOp(Ops.MUL, dtypes.float, arg=None, src=( + UOp(Ops.MUL, dtypes.float, arg=None, src=( + UOp(Ops.CAST, dtypes.float, arg=None, src=( + UOp(Ops.CAST, dtypes.int, arg=None, src=( + UOp(Ops.LOAD, dtypes.uchar, arg=None, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(150528), arg=1, src=()), + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 192), strides=(0, 5376, 192, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=0.012368360534310341, src=( + x22:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 192), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.MUL, dtypes.float, arg=None, src=( + UOp(Ops.CAST, dtypes.float, arg=None, src=( + UOp(Ops.CAST, dtypes.int, arg=None, src=( + UOp(Ops.LOAD, dtypes.char, arg=None, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.char.ptr(6144), arg=2, src=()), + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 48, 4), strides=(4, 128, 1), offset=0, mask=None, contiguous=False), View(shape=(1, 28, 28, 32, 192), strides=(0, 0, 0, 192, 1), offset=0, mask=None, contiguous=False))), src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=0.007441135589033365, src=( + x22,)),)),)),)), + UOp(Ops.MUL, dtypes.float, arg=None, src=( + UOp(Ops.CAST, dtypes.float, arg=None, src=( + UOp(Ops.LOAD, dtypes.int, arg=None, src=( + UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(32), arg=3, src=()), + UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)), + UOp(Ops.CONST, dtypes.float, arg=9.203465015161783e-05, src=( + x36:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)), + UOp(Ops.CONST, dtypes.float, arg=33.812857328652136, src=( + x36,)),)), + UOp(Ops.CONST, dtypes.float, arg=0.4999999, src=( + x36,)),)), + UOp(Ops.CONST, dtypes.float, arg=136.0, src=( + x36,)),)),)), + UOp(Ops.CONST, dtypes.int, arg=0, src=( + x36,)),)), + x41:=UOp(Ops.CONST, dtypes.int, arg=-1, src=( + x36,)),)), + UOp(Ops.CONST, dtypes.int, arg=-256, src=( + x36,)),)), + x41,)),)),)),))""") + opts = [Opt(op=OptOps.UNROLL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=32), Opt(op=OptOps.UPCAST, axis=0, arg=4)] + with Context(DEVECTORIZE=0, QUANTIZE=1): + k = Kernel(ast, opts=Device[Device.DEFAULT].renderer) + for opt in opts: k.apply_opt(opt) + prg = k.to_program() + #print(prg.src) + + new_src = """ +typedef int int32 __attribute__((aligned(128),vector_size(128))); +typedef signed char signed_char128 __attribute__((aligned(128),vector_size(128))); +typedef unsigned char unsigned_char8 __attribute__((aligned(8),vector_size(8))); +typedef unsigned char unsigned_char4 __attribute__((aligned(4),vector_size(4))); +typedef unsigned char unsigned_char128 __attribute__((aligned(128),vector_size(128))); +__attribute__((noinline)) void r_196_24_8_32_4(unsigned char* restrict __attribute__((align_value(128))) data0, unsigned char* restrict __attribute__((align_value(128))) data1, signed char* restrict __attribute__((align_value( +128))) data2, int* restrict __attribute__((align_value(128))) data3) { + int32 cast0 = (int32){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + int32 val0 = *((int32*)((data3+0))); + for (int ridx0 = 0; ridx0 < 196; ridx0++) { + int32 acc0 = cast0; + int32 acc1 = cast0; + int32 acc2 = cast0; + int32 acc3 = cast0; + __builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768); + __builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768+192); + __builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768+384); + __builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768+576); + for (int ridx1 = 0; ridx1 < 24; ridx1++) { + signed_char128 val1 = *((signed_char128*)((data2+(ridx1<<8)))); + signed_char128 val2 = *((signed_char128*)((data2+((1+(ridx1<<1))<<7)))); + + int alu0 = ((ridx0*768)+(ridx1<<3)); + + unsigned_char8 val3 = *((unsigned_char8*)((data1+alu0))); + __builtin_HEXAGON_Y2_dcfetch(((data1+alu0)+16)); + unsigned_char8 val4 = *((unsigned_char8*)((data1+(alu0+192)))); + __builtin_HEXAGON_Y2_dcfetch(((data1+(alu0+192))+16)); + unsigned_char8 val5 = *((unsigned_char8*)((data1+(alu0+384)))); + __builtin_HEXAGON_Y2_dcfetch(((data1+(alu0+384))+16)); + unsigned_char8 val6 = *((unsigned_char8*)((data1+(alu0+576)))); + __builtin_HEXAGON_Y2_dcfetch(((data1+(alu0+576))+16)); + + unsigned_char4 alu5 = __builtin_shufflevector(val3, val3, 0, 1, 2, 3); + unsigned_char4 alu6 = __builtin_shufflevector(val4, val4, 0, 1, 2, 3); + unsigned_char4 alu7 = __builtin_shufflevector(val5, val5, 0, 1, 2, 3); + unsigned_char4 alu8 = __builtin_shufflevector(val6, val6, 0, 1, 2, 3); + acc0 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc0, val1, (*((unsigned int*)&alu5))); + acc1 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc1, val1, (*((unsigned int*)&alu6))); + acc2 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc2, val1, (*((unsigned int*)&alu7))); + acc3 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc3, val1, (*((unsigned int*)&alu8))); + + unsigned_char4 alu9 = __builtin_shufflevector(val3, val3, 4, 5, 6, 7); + unsigned_char4 alu10 = __builtin_shufflevector(val4, val4, 4, 5, 6, 7); + unsigned_char4 alu11 = __builtin_shufflevector(val5, val5, 4, 5, 6, 7); + unsigned_char4 alu12 = __builtin_shufflevector(val6, val6, 4, 5, 6, 7); + acc0 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc0, val2, (*((unsigned int*)&alu9))); + acc1 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc1, val2, (*((unsigned int*)&alu10))); + acc2 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc2, val2, (*((unsigned int*)&alu11))); + acc3 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc3, val2, (*((unsigned int*)&alu12))); + } + unsigned_char128 alu18 = __builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B((((((acc3+val0)*203)+32767)/65536)+136), (((((acc2+val0)*203)+32767)/65536)+136)), __builtin_HEXAGON_V6_vpackwh_sat_128B((((((acc1+val0)*203)+32767)/65536)+136), (((((acc0+val0)*203)+32767)/65536)+136))); + *((unsigned_char128*)((data0+(ridx0<<7)))) = alu18; + } +} +""" + prg = replace(prg, src=new_src+prg.src.split("/* DSP boilerplate */ ")[1]) + rt = CompiledRunner(prg) + #Device.default.compiler.disassemble(rt.lib) + ei = ExecItem(rt, bufs_from_lin(k)) + tm = ei.run(wait=True) + print(f"final time {tm*1e6:.2f} us") + if __name__ == "__main__": unittest.main() diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 31d5e597cc..c592e63925 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -114,7 +114,7 @@ class Ops(FastEnum): VALID = auto(); SPECIAL = auto(); NOOP = auto() # noqa: E702 # reduce - REDUCE_AXIS = auto() + REDUCE_AXIS = auto(); REDUCE = auto() # noqa: E702 # helper ops GEP = auto(); VECTORIZE = auto(); CAT = auto(); PTRCAT = auto() # noqa: E702 diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py index 76cbed5f1a..0632da7e62 100755 --- a/tinygrad/viz/serve.py +++ b/tinygrad/viz/serve.py @@ -9,7 +9,7 @@ from tinygrad.codegen.kernel import Kernel from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileRangeEvent, ProfileGraphEvent from tinygrad.dtype import dtypes -uops_colors = {Ops.LOAD: "#ffc0c0", Ops.STORE: "#87CEEB", Ops.CONST: "#e0e0e0", Ops.VCONST: "#e0e0e0", +uops_colors = {Ops.LOAD: "#ffc0c0", Ops.STORE: "#87CEEB", Ops.CONST: "#e0e0e0", Ops.VCONST: "#e0e0e0", Ops.REDUCE: "#FF5B5B", Ops.DEFINE_GLOBAL: "#ffe0b0", Ops.DEFINE_LOCAL: "#ffe0d0", Ops.DEFINE_ACC: "#f0ffe0", Ops.REDUCE_AXIS: "#FF6B6B", Ops.RANGE: "#c8a0e0", Ops.ASSIGN: "#e0ffc0", Ops.BARRIER: "#ff8080", Ops.IF: "#c8b0c0", Ops.SPECIAL: "#c0c0ff", Ops.INDEX: "#e8ffa0", Ops.WMMA: "#efefc0", Ops.VIEW: "#C8F9D4", Ops.MULTI: "#f6ccff", Ops.KERNEL: "#3e7f55",