actual tests for the dsp backend [pr] (#9102)

* actual tests for the dsp backend [pr]

* fix name
This commit is contained in:
George Hotz
2025-02-15 15:17:56 +08:00
committed by GitHub
parent 7e09057afa
commit 4672d9af73
2 changed files with 148 additions and 31 deletions

View File

@@ -3,7 +3,7 @@ import numpy as np
from extra.datasets.imagenet import get_imagenet_categories, get_val_files, center_crop
from examples.benchmark_onnx import load_onnx_model
from PIL import Image
from tinygrad import Tensor, dtypes
from tinygrad import Tensor, dtypes, GlobalCounters
from tinygrad.helpers import fetch, getenv
# works:
@@ -19,6 +19,7 @@ from tinygrad.helpers import fetch, getenv
# QUANT=1 python3 examples/test_onnx_imagenet.py
# https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx
# DONT_REALIZE_EXPAND=1 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
# VIZ=1 DONT_REALIZE_EXPAND=1 python3 examples/benchmark_onnx.py /tmp/model.quant.onnx
def imagenet_dataloader(cnt=0):
@@ -65,7 +66,8 @@ if __name__ == "__main__":
assert t_spec.shape[1:] == (3,224,224), f"shape is {t_spec.shape}"
hit = 0
for i,(img,y) in enumerate(imagenet_dataloader(cnt=100)):
for i,(img,y) in enumerate(imagenet_dataloader(cnt=getenv("CNT", 100))):
GlobalCounters.reset()
p = run_onnx_jit(**{t_name:img})
assert p.shape == (1,1000)
t = p.argmax().item()

View File

@@ -1,45 +1,53 @@
import numpy as np
import unittest
from tinygrad import Tensor, Context, Device
from dataclasses import replace
from tinygrad import Tensor, Context, Device, dtypes
from tinygrad.codegen.kernel import Kernel, Opt, OptOps
from tinygrad.engine.realize import CompiledRunner, ExecItem
N = 1024
N = 512
def create_gemm_model(model_path:str, in_size=N, out_size=N):
def create_gemm_model(model_path:str, batch_size=N, in_size=N, out_size=N, bias=False):
import onnx
from onnx import helper, numpy_helper, TensorProto
# Define input and output
input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, in_size])
output_tensor = helper.make_tensor_value_info("output", TensorProto.FLOAT, [1, out_size])
input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, [batch_size, in_size])
output_tensor = helper.make_tensor_value_info("output", TensorProto.FLOAT, [batch_size, out_size])
# Create random weights and bias
W_data = np.random.randn(in_size, out_size).astype(np.float32)
B_data = np.random.randn(out_size).astype(np.float32)
W_init = numpy_helper.from_array(W_data, name="W")
B_init = numpy_helper.from_array(B_data, name="B")
gemm_node = helper.make_node("Gemm", inputs=["input", "W", "B"], outputs=["output"], alpha=1.0, beta=1.0, transB=0)
graph_def = helper.make_graph([gemm_node], "SingleGemmGraph", [input_tensor], [output_tensor], initializer=[W_init, B_init])
if bias:
B_data = np.random.randn(out_size).astype(np.float32)
B_init = numpy_helper.from_array(B_data, name="B")
gemm_node = helper.make_node("Gemm", inputs=["input", "W", "B"], outputs=["output"], alpha=1.0, beta=1.0, transB=0)
graph_def = helper.make_graph([gemm_node], "SingleGemmGraph", [input_tensor], [output_tensor], initializer=[W_init, B_init])
else:
gemm_node = helper.make_node("Gemm", inputs=["input", "W"], outputs=["output"], alpha=1.0, beta=1.0, transB=0)
graph_def = helper.make_graph([gemm_node], "SingleGemmGraph", [input_tensor], [output_tensor], initializer=[W_init])
# Create and save the model
model_def = helper.make_model(graph_def, producer_name="single_gemm_example")
onnx.save_model(model_def, model_path)
return model_path
def sexec(out:Tensor, opts:list[Opt]):
def sexec(out:Tensor, opts:list[Opt], replace_src=None, run_count=3):
si = out.schedule()[-1]
k = Kernel(si.ast, opts=Device[Device.DEFAULT].renderer)
#opts = [Opt(op=OptOps.UPCAST, axis=0, arg=128)] #, Opt(op=OptOps.UNROLL, axis=0, arg=4)]
for opt in opts: k.apply_opt(opt)
prg = k.to_program()
if replace_src is not None:
old_name = prg.src.split("inscount();\n")[1].split("(")[0]
prg = replace(prg, src=replace_src + "/* DSP boilerplate */" + prg.src.split("/* DSP boilerplate */")[1].replace(old_name, "fxn"))
ei = ExecItem(CompiledRunner(prg), [x.ensure_allocated() for x in si.bufs], si.metadata)
for _ in range(3): ei.run(wait=True)
for _ in range(run_count): ei.run(wait=True)
@unittest.skipIf(Device.DEFAULT != "DSP", "only tests for DSP")
class TestQuantizeOnnx(unittest.TestCase):
def test_quant(self):
def test_quant_128(self): self.test_quant(128)
def test_quant(self, sz=512):
from onnxruntime.quantization import quantize_static, QuantFormat, QuantType, CalibrationDataReader
from examples.benchmark_onnx import load_onnx_model
class FakeDataReader(CalibrationDataReader):
@@ -47,15 +55,16 @@ class TestQuantizeOnnx(unittest.TestCase):
def get_next(self) -> dict:
self.cnt += 1
if self.cnt == 100: return None
return {"input": np.random.uniform(size=(1, N)).astype(np.float32)}
return {"input": np.random.uniform(size=(sz, sz)).astype(np.float32)}
out_file = "/tmp/test_out.onnx"
quantize_static(create_gemm_model("/tmp/test_in.onnx"), out_file,
FakeDataReader(), quant_format=QuantFormat.QDQ, per_channel=False,
activation_type=QuantType.QUInt8, weight_type=QuantType.QInt8,
extra_options={"ActivationSymmetric": False})
# divide is ~1500-2000 without reduce_range, 750-900 with it
quantize_static(create_gemm_model("/tmp/test_in.onnx", sz, sz, sz), out_file,
FakeDataReader(), quant_format=QuantFormat.QDQ, per_channel=False, reduce_range=False,
activation_type=QuantType.QInt8, weight_type=QuantType.QInt8,
extra_options={"ActivationSymmetric": True})
run_onnx_jit, _ = load_onnx_model(out_file)
with Context(NOOPT=1):
run_onnx_jit(input=Tensor(np.random.uniform(size=(1, N)).astype(np.float32)))
with Context(DONT_REALIZE_EXPAND=1):
run_onnx_jit(input=Tensor(np.random.uniform(size=(sz, sz)).astype(np.float32)))
def test_prequant_conv2d_1x1(self):
X = Tensor(np.random.uniform(0, 255, size=(1, 32, 128, 128)).astype(np.uint8))
@@ -66,27 +75,133 @@ class TestQuantizeOnnx(unittest.TestCase):
def test_prequant_gemm(self):
N = 512
# ugh, it's so broken with those casts. need DONT_REALIZE_EXPAND=1 python3 test/test_quantize_onnx.py TestQuantizeOnnx.test_prequant
X = Tensor(np.random.uniform(0, 255, size=(N,N)).astype(np.uint8))
W = Tensor(np.random.uniform(0, 255, size=(N,N)).astype(np.uint8))
out = X.matmul(W, acc_dtype=X.dtype)
opts = [Opt(op=OptOps.UPCAST, axis=1, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)]
sexec(out, opts)
def test_prequant_gemm_intacc(self):
# TODO: this has to work
def test_prequant_gemm_intacc_early(self, xi=np.int8, wi=np.int8):
N = 512
X = Tensor(np.random.uniform(0, 255, size=(N,N)).astype(xi))
W = Tensor(np.random.uniform(0, 255, size=(N,N)).astype(wi))
with Context(DONT_REALIZE_EXPAND=1):
# this divide is interesting and forces the accumulator to actually be an int
out = (X.cast("int").matmul(W.cast("int"))//1000).cast("int8")
opts = [Opt(op=OptOps.UPCAST, axis=1, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)]
sexec(out, opts)
def test_prequant_gemm_handcode(self):
src = """typedef int int128 __attribute__((aligned(512),vector_size(512)));
typedef int int32 __attribute__((aligned(128),vector_size(128)));
typedef int int64 __attribute__((aligned(256),vector_size(256)));
typedef unsigned char unsigned_char4 __attribute__((aligned(4),vector_size(4)));
typedef signed char signed_char128 __attribute__((aligned(128),vector_size(128)));
typedef unsigned char unsigned_char128 __attribute__((aligned(128),vector_size(128)));
typedef unsigned char unsigned_char256 __attribute__((aligned(256),vector_size(256)));
union V256 {
unsigned_char256 vec256;
struct {
unsigned_char128 lo128;
unsigned_char128 hi128;
};
};
__attribute__((noinline)) void fxn(unsigned char* restrict __attribute__((align_value(128))) data0,
unsigned char* restrict __attribute__((align_value(128))) data1,
signed char* restrict __attribute__((align_value(128))) data2) {
for (int ridx0 = 0; ridx0 < 512; ridx0++) {
int alu0 = (ridx0<<9);
for (int ridx1 = 0; ridx1 < 4; ridx1++) {
int alu1 = (ridx1<<7);
int32 acc0 = __builtin_HEXAGON_V6_vd0_128B();
int32 acc1 = __builtin_HEXAGON_V6_vd0_128B();
int32 acc2 = __builtin_HEXAGON_V6_vd0_128B();
int32 acc3 = __builtin_HEXAGON_V6_vd0_128B();
for (int ridx2 = 0; ridx2 < 128; ridx2++) {
unsigned_char4 val0 = *((unsigned_char4*)((data1+(alu0+(ridx2<<2)))));
int alu2 = (alu1+(ridx2<<11));
signed_char128 x0 = *((signed_char128*)((data2+alu2)));
signed_char128 x1 = *((signed_char128*)((data2+(alu2+512))));
signed_char128 x2 = *((signed_char128*)((data2+(alu2+1024))));
signed_char128 x3 = *((signed_char128*)((data2+(alu2+1536))));
union V256 ss01;
// ss01.lo128 = (x0[0], x1[0], x0[2], x1[2], x0[4], x1[4], ...)
// ss01.hi128 = (x0[1], x1[1], x0[3], x1[3], x0[5], x1[5], ...)
ss01.vec256 = __builtin_HEXAGON_V6_vshufoeb_128B(x1, x0);
union V256 ss23;
// ss23.lo128 = (x2[0], x3[0], x2[2], x3[2], x2[4], x3[4], ...)
// ss23.hi128 = (x2[1], x3[1], x2[3], x3[3], x2[5], x3[5], ...)
ss23.vec256 = __builtin_HEXAGON_V6_vshufoeb_128B(x3, x2);
union V256 sslo;
// sslo.lo128 = (x0[0], x1[0], x2[0], x3[0], x0[4], x1[4], ...)
// sslo.hi128 = (x0[2], x1[2], x2[2], x3[2], x0[6], x1[6], ...)
sslo.vec256 = __builtin_HEXAGON_V6_vdealvdd_128B(ss23.lo128, ss01.lo128, 2);
union V256 sshi;
// sshi.lo128 = (x0[1], x1[1], x2[1], x3[1], x0[5], x1[5], ...)
// sshi.hi128 = (x0[3], x1[3], x2[3], x3[3], x0[7], x1[7], ...)
sshi.vec256 = __builtin_HEXAGON_V6_vdealvdd_128B(ss23.hi128, ss01.hi128, 2);
//unsigned_char128 w0 = (unsigned_char128){val0[0],val0[1],val0[2],val0[3],val0[0],val0[1],val0[2],val0[3],...
unsigned_char128 w0 = __builtin_HEXAGON_V6_lvsplatw_128B(*((unsigned int*)&val0));
acc0 = __builtin_HEXAGON_V6_vrmpybusv_acc_128B(acc0, w0, sslo.lo128);
acc1 = __builtin_HEXAGON_V6_vrmpybusv_acc_128B(acc1, w0, sshi.lo128);
acc2 = __builtin_HEXAGON_V6_vrmpybusv_acc_128B(acc2, w0, sslo.hi128);
acc3 = __builtin_HEXAGON_V6_vrmpybusv_acc_128B(acc3, w0, sshi.hi128);
}
acc0 /= 1000;
acc1 /= 1000;
acc2 /= 1000;
acc3 /= 1000;
// ','.join([f"acc{j}[{i}]" for i in range(32) for j in range(4)])
// acc0[0], acc0[1], acc0[2], ..... acc3[30], acc3[31]
unsigned_char128 packed = __builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B(acc3, acc2),
__builtin_HEXAGON_V6_vpackwh_sat_128B(acc1, acc0));
packed = __builtin_HEXAGON_V6_vshuffb_128B(packed);
packed = __builtin_HEXAGON_V6_vshuffb_128B(packed);
// acc0[0], acc1[0], acc2[0], ..... acc2[31], acc3[31]
*((unsigned_char128*)((data0+(alu0+alu1)))) = packed;
}
}
}"""
self.test_prequant_gemm_intacc(np.uint8, np.int8, src)
def test_prequant_gemm_intacc_128(self): self.test_prequant_gemm_intacc(np.uint8, np.int8, N=128)
def test_prequant_gemm_intacc_256(self): self.test_prequant_gemm_intacc(np.uint8, np.int8, N=256)
def test_prequant_gemm_intacc(self, xi=np.uint8, wi=np.uint8, replace_src=None, N=512, clip=True):
X = Tensor(m1:=(np.random.uniform(0, 255, size=(N,N)).astype(xi))).realize()
W = Tensor(m2:=(np.random.uniform(0, 255, size=(N,N)).astype(wi))).realize()
# ugh, it's so broken with those casts. need DONT_REALIZE_EXPAND=1 python3 test/test_quantize_onnx.py TestQuantizeOnnx.test_prequant
X = Tensor(np.random.uniform(0, 255, size=(N,N)).astype(np.uint8))
W = Tensor(np.random.uniform(0, 255, size=(N,N)).astype(np.int8))
out = X.matmul(W)
opts = [Opt(op=OptOps.UPCAST, axis=1, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)]
sexec(out, opts)
tg_dtype = dtypes.int8 if xi == np.int8 else dtypes.uint8
with Context(DONT_REALIZE_EXPAND=1):
out = (X.int().matmul(W.int())//1000)
if clip: out = out.clip(dtypes.min(tg_dtype),dtypes.max(tg_dtype))
out = out.cast(tg_dtype)
opts = [Opt(op=OptOps.UPCAST, axis=1, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)]
sexec(out, opts, replace_src, run_count=1)
tout = out.numpy()
mout = ((m1.astype(np.int32) @ m2.astype(np.int32)) / 1000)
if clip: mout = mout.clip(dtypes.min(tg_dtype),dtypes.max(tg_dtype))
mout = mout.astype(xi)
print(tout)
print(mout)
np.testing.assert_equal(tout, mout)
def test_prequant_gemm_intacc_wi(self): self.test_prequant_gemm_intacc(wi=np.int8)
def test_prequant_gemm_intacc_xiwi(self): self.test_prequant_gemm_intacc(xi=np.int8, wi=np.int8)
def test_prequant_gemm_intacc_xiwi_noclip(self): self.test_prequant_gemm_intacc(xi=np.int8, wi=np.int8, clip=False)
def test_prequant_gemv(self):
N = 2048
# ugh, it's so broken with those casts. need DONT_REALIZE_EXPAND=1 python3 test/test_quantize_onnx.py TestQuantizeOnnx.test_prequant
X = Tensor(np.random.uniform(0, 255, size=(1,N)).astype(np.uint8))
W = Tensor(np.random.uniform(0, 255, size=(N,N)).astype(np.uint8))
X = Tensor(np.random.uniform(0, 255, size=(1,N)).astype(np.uint8)).realize()
W = Tensor(np.random.uniform(0, 255, size=(N,N)).astype(np.uint8)).realize()
#out = X.cast(dtypes.int) @ W.cast(dtypes.int)
#out = X @ W
out = X.matmul(W, acc_dtype=X.dtype)