switch quantization to unsigned/unsigned + add Ops.REDUCE (#9527)

* switch quantization to unsigned/unsigned + add Ops.REDUCE * tests * nhwc + replay pkl
2026-01-10 07:28:15 -05:00 · 2025-03-21 17:02:37 +08:00
parent a35b0a88bf
commit 8e555c586c
6 changed files with 238 additions and 11 deletions
--- a/test/test_quantize_onnx.py
+++ b/test/test_quantize_onnx.py
@@ -1,10 +1,13 @@
+# ruff: noqa: E501
 import numpy as np
 import unittest
 from dataclasses import replace
 from tinygrad import Tensor, Context, Device, dtypes
-from tinygrad.ops import Ops
+from tinygrad.ops import Ops, UOp # noqa: F401 # pylint: disable=unused-import
 from tinygrad.codegen.kernel import Kernel, Opt, OptOps
 from tinygrad.engine.realize import CompiledRunner, ExecItem, lower_schedule_item
+from tinygrad.engine.search import bufs_from_lin
+from tinygrad.shape.shapetracker import ShapeTracker, View # noqa: F401 # pylint: disable=unused-import

 N = 512

@@ -40,7 +43,7 @@ def sexec(out:Tensor, opts:list[Opt], replace_src=None, run_count=3):
  for opt in opts: k.apply_opt(opt)
  prg = k.to_program()
  if replace_src is not None:
-    old_name = prg.src.split("inscount();\n")[1].split("(")[0]
+    old_name = prg.src.split("__attribute__((noinline)) void ")[1].split("(")[0]
    prg = replace(prg, src=replace_src + "/* DSP boilerplate */" + prg.src.split("/* DSP boilerplate */")[1].replace(old_name, "fxn"))
  ei = ExecItem(CompiledRunner(prg), [x.ensure_allocated() for x in si.bufs], si.metadata)
  for _ in range(run_count): ei.run(wait=True)
@@ -195,9 +198,12 @@ class TestQuantizeOnnx(unittest.TestCase):
    }"""
    self.test_prequant_gemm_intacc(np.uint8, np.int8, src)

+  def test_prequant_gemm_intacc_32(self):
+    opts = [Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=0)]
+    self.test_prequant_gemm_intacc(np.uint8, np.int8, N=32, opts=opts)
  def test_prequant_gemm_intacc_128(self): self.test_prequant_gemm_intacc(np.uint8, np.int8, N=128)
  def test_prequant_gemm_intacc_256(self): self.test_prequant_gemm_intacc(np.uint8, np.int8, N=256)
-  def test_prequant_gemm_intacc(self, xi=np.uint8, wi=np.uint8, replace_src=None, N=512, clip=True):
+  def test_prequant_gemm_intacc(self, xi=np.uint8, wi=np.uint8, replace_src=None, N=512, clip=True, opts=None):
    X = Tensor(m1:=(np.random.uniform(0, 255, size=(N,N)).astype(xi))).realize()
    W = Tensor(m2:=(np.random.uniform(0, 255, size=(N,N)).astype(wi))).realize()
    # ugh, it's so broken with those casts. need DONT_REALIZE_EXPAND=1 python3 test/test_quantize_onnx.py TestQuantizeOnnx.test_prequant
@@ -206,7 +212,7 @@ class TestQuantizeOnnx(unittest.TestCase):
      out = (X.int().matmul(W.int())//1000)
      if clip: out = out.clip(dtypes.min(tg_dtype),dtypes.max(tg_dtype))
      out = out.cast(tg_dtype)
-      opts = [Opt(op=OptOps.UPCAST, axis=1, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)]
+      opts = [Opt(op=OptOps.UPCAST, axis=1, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)] if opts is None else opts
      sexec(out, opts, replace_src, run_count=1)
    tout = out.numpy()
    mout = ((m1.astype(np.int32) @ m2.astype(np.int32)) / 1000)
@@ -231,5 +237,132 @@ class TestQuantizeOnnx(unittest.TestCase):
    opts = [Opt(op=OptOps.UPCAST, axis=0, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)]
    sexec(out, opts)

+@unittest.skipIf(Device.DEFAULT != "DSP", "only tests for DSP")
+class TestDSPCache(unittest.TestCase):
+  def test_cache_speed(self):
+    # string becuase this breaks Python language server for syntax highlight for some reason
+    ast = eval("""UOp(Ops.SINK, dtypes.void, arg=None, src=(
+      UOp(Ops.STORE, dtypes.void, arg=None, src=(
+        UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(25088), arg=0, src=()),
+        UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 896, 32, 1, 0), offset=0, mask=None, contiguous=True),)), src=()),
+        UOp(Ops.CAST, dtypes.uchar, arg=None, src=(
+          UOp(Ops.XOR, dtypes.int, arg=None, src=(
+            UOp(Ops.MAX, dtypes.int, arg=None, src=(
+              UOp(Ops.XOR, dtypes.int, arg=None, src=(
+                UOp(Ops.MAX, dtypes.int, arg=None, src=(
+                  UOp(Ops.CAST, dtypes.int, arg=None, src=(
+                    UOp(Ops.ADD, dtypes.float, arg=None, src=(
+                      UOp(Ops.ADD, dtypes.float, arg=None, src=(
+                        UOp(Ops.MUL, dtypes.float, arg=None, src=(
+                          UOp(Ops.ADD, dtypes.float, arg=None, src=(
+                            UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (4,)), src=(
+                              UOp(Ops.MUL, dtypes.float, arg=None, src=(
+                                UOp(Ops.MUL, dtypes.float, arg=None, src=(
+                                  UOp(Ops.CAST, dtypes.float, arg=None, src=(
+                                    UOp(Ops.CAST, dtypes.int, arg=None, src=(
+                                      UOp(Ops.LOAD, dtypes.uchar, arg=None, src=(
+                                        UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(150528), arg=1, src=()),
+                                        UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 192), strides=(0, 5376, 192, 0, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),
+                                  UOp(Ops.CONST, dtypes.float, arg=0.012368360534310341, src=(
+                                    x22:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 192), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),
+                                UOp(Ops.MUL, dtypes.float, arg=None, src=(
+                                  UOp(Ops.CAST, dtypes.float, arg=None, src=(
+                                    UOp(Ops.CAST, dtypes.int, arg=None, src=(
+                                      UOp(Ops.LOAD, dtypes.char, arg=None, src=(
+                                        UOp(Ops.DEFINE_GLOBAL, dtypes.char.ptr(6144), arg=2, src=()),
+                                        UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(32, 48, 4), strides=(4, 128, 1), offset=0, mask=None, contiguous=False), View(shape=(1, 28, 28, 32, 192), strides=(0, 0, 0, 192, 1), offset=0, mask=None, contiguous=False))), src=()),)),)),)),
+                                  UOp(Ops.CONST, dtypes.float, arg=0.007441135589033365, src=(
+                                    x22,)),)),)),)),
+                            UOp(Ops.MUL, dtypes.float, arg=None, src=(
+                              UOp(Ops.CAST, dtypes.float, arg=None, src=(
+                                UOp(Ops.LOAD, dtypes.int, arg=None, src=(
+                                  UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(32), arg=3, src=()),
+                                  UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),
+                              UOp(Ops.CONST, dtypes.float, arg=9.203465015161783e-05, src=(
+                                x36:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),
+                          UOp(Ops.CONST, dtypes.float, arg=33.812857328652136, src=(
+                            x36,)),)),
+                        UOp(Ops.CONST, dtypes.float, arg=0.4999999, src=(
+                          x36,)),)),
+                      UOp(Ops.CONST, dtypes.float, arg=136.0, src=(
+                        x36,)),)),)),
+                  UOp(Ops.CONST, dtypes.int, arg=0, src=(
+                    x36,)),)),
+                x41:=UOp(Ops.CONST, dtypes.int, arg=-1, src=(
+                  x36,)),)),
+              UOp(Ops.CONST, dtypes.int, arg=-256, src=(
+                x36,)),)),
+            x41,)),)),)),))""")
+    opts = [Opt(op=OptOps.UNROLL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=32), Opt(op=OptOps.UPCAST, axis=0, arg=4)]
+    with Context(DEVECTORIZE=0, QUANTIZE=1):
+      k = Kernel(ast, opts=Device[Device.DEFAULT].renderer)
+      for opt in opts: k.apply_opt(opt)
+      prg = k.to_program()
+      #print(prg.src)
+
+    new_src = """
+typedef int int32 __attribute__((aligned(128),vector_size(128)));
+typedef signed char signed_char128 __attribute__((aligned(128),vector_size(128)));
+typedef unsigned char unsigned_char8 __attribute__((aligned(8),vector_size(8)));
+typedef unsigned char unsigned_char4 __attribute__((aligned(4),vector_size(4)));
+typedef unsigned char unsigned_char128 __attribute__((aligned(128),vector_size(128)));
+__attribute__((noinline)) void r_196_24_8_32_4(unsigned char* restrict __attribute__((align_value(128))) data0, unsigned char* restrict __attribute__((align_value(128))) data1, signed char* restrict __attribute__((align_value(
+128))) data2, int* restrict __attribute__((align_value(128))) data3) {
+  int32 cast0 = (int32){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  int32 val0 = *((int32*)((data3+0)));
+  for (int ridx0 = 0; ridx0 < 196; ridx0++) {
+    int32 acc0 = cast0;
+    int32 acc1 = cast0;
+    int32 acc2 = cast0;
+    int32 acc3 = cast0;
+    __builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768);
+    __builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768+192);
+    __builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768+384);
+    __builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768+576);
+    for (int ridx1 = 0; ridx1 < 24; ridx1++) {
+      signed_char128 val1 = *((signed_char128*)((data2+(ridx1<<8))));
+      signed_char128 val2 = *((signed_char128*)((data2+((1+(ridx1<<1))<<7))));
+
+      int alu0 = ((ridx0*768)+(ridx1<<3));
+
+      unsigned_char8 val3 = *((unsigned_char8*)((data1+alu0)));
+      __builtin_HEXAGON_Y2_dcfetch(((data1+alu0)+16));
+      unsigned_char8 val4 = *((unsigned_char8*)((data1+(alu0+192))));
+      __builtin_HEXAGON_Y2_dcfetch(((data1+(alu0+192))+16));
+      unsigned_char8 val5 = *((unsigned_char8*)((data1+(alu0+384))));
+      __builtin_HEXAGON_Y2_dcfetch(((data1+(alu0+384))+16));
+      unsigned_char8 val6 = *((unsigned_char8*)((data1+(alu0+576))));
+      __builtin_HEXAGON_Y2_dcfetch(((data1+(alu0+576))+16));
+
+      unsigned_char4 alu5 = __builtin_shufflevector(val3, val3,    0,   1,   2,   3);
+      unsigned_char4 alu6 = __builtin_shufflevector(val4, val4,    0,   1,   2,   3);
+      unsigned_char4 alu7 = __builtin_shufflevector(val5, val5,    0,   1,   2,   3);
+      unsigned_char4 alu8 = __builtin_shufflevector(val6, val6,    0,   1,   2,   3);
+      acc0 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc0, val1, (*((unsigned int*)&alu5)));
+      acc1 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc1, val1, (*((unsigned int*)&alu6)));
+      acc2 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc2, val1, (*((unsigned int*)&alu7)));
+      acc3 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc3, val1, (*((unsigned int*)&alu8)));
+
+      unsigned_char4 alu9 = __builtin_shufflevector(val3, val3,    4,   5,   6,   7);
+      unsigned_char4 alu10 = __builtin_shufflevector(val4, val4,    4,   5,   6,   7);
+      unsigned_char4 alu11 = __builtin_shufflevector(val5, val5,    4,   5,   6,   7);
+      unsigned_char4 alu12 = __builtin_shufflevector(val6, val6,    4,   5,   6,   7);
+      acc0 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc0, val2, (*((unsigned int*)&alu9)));
+      acc1 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc1, val2, (*((unsigned int*)&alu10)));
+      acc2 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc2, val2, (*((unsigned int*)&alu11)));
+      acc3 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc3, val2, (*((unsigned int*)&alu12)));
+    }
+    unsigned_char128 alu18 = __builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B((((((acc3+val0)*203)+32767)/65536)+136), (((((acc2+val0)*203)+32767)/65536)+136)), __builtin_HEXAGON_V6_vpackwh_sat_128B((((((acc1+val0)*203)+32767)/65536)+136), (((((acc0+val0)*203)+32767)/65536)+136)));
+    *((unsigned_char128*)((data0+(ridx0<<7)))) = alu18;
+  }
+}
+"""
+    prg = replace(prg, src=new_src+prg.src.split("/* DSP boilerplate */ ")[1])
+    rt = CompiledRunner(prg)
+    #Device.default.compiler.disassemble(rt.lib)
+    ei = ExecItem(rt, bufs_from_lin(k))
+    tm = ei.run(wait=True)
+    print(f"final time {tm*1e6:.2f} us")
+
 if __name__ == "__main__":
  unittest.main()