dsp stuff / sniff ioctls from snpe (#9490)

* sniff ioctls from snpe * dump input buffers * snpe logs from dsp * NHWC support * knum 3 * this run? * revert those --------- Co-authored-by: Comma Device <device@comma.ai>
2026-01-06 21:53:53 -05:00 · 2025-03-20 10:38:23 +08:00
parent 2223b93338
commit 68053d0510
9 changed files with 1075 additions and 3 deletions
--- a/extra/dsp/hook.py
+++ b/extra/dsp/hook.py
@@ -0,0 +1,101 @@
+import os
+print("from import")
+del os.environ["LD_PRELOAD"]
+import ctypes, ctypes.util
+from extra.dsp.run import install_hook, ioctl, libc, get_struct, qcom_dsp, format_struct, to_mv, hexdump
+
+@ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
+def _mmap(addr, length, prot, flags, fd, offset):
+  mmap_type = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
+  orig_mmap = mmap_type(ctypes.addressof(orig_mmap_mv))
+  ret = orig_mmap(addr, length, prot, flags, fd, offset)
+  # ll = os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else ""
+  print(f"mmap {addr=}, {length=}, {prot=}, {flags=}, {fd=}, {offset=} {ret=}")
+  return ret
+
+# In-process hooks are disabled here; handle_ioctl below is called from the C ioctl wrapper in preload.c.
+#install_hook(libc.ioctl, ioctl)
+#orig_mmap_mv = install_hook(libc.mmap, _mmap)
+print("import done")
+import mmap
+
+# ION handle -> allocation length; filled on ION_IOC_ALLOC, removed on ION_IOC_FREE
+alloc_sizes = {}
+# ION handle -> mmap.mmap over the fd reported by ION_IOC_MAP; removed on ION_IOC_FREE
+mmaped = {}
+
+def handle_ioctl(fd, request, argp, ret):
+  fn = os.readlink(f"/proc/self/fd/{fd}")
+  idir, size, itype, nr = (request>>30), (request>>16)&0x3FFF, (request>>8)&0xFF, request&0xFF
+
+  if fn == "/dev/ion":
+    if nr == 0:
+      st = get_struct(argp, qcom_dsp.struct_ion_allocation_data)
+      print(ret, "ION_IOC_ALLOC", format_struct(st))
+      alloc_sizes[st.handle] = st.len
+    elif nr == 1:
+      st = get_struct(argp, qcom_dsp.struct_ion_handle_data)
+      print(ret, "ION_IOC_FREE", format_struct(st))
+      if st.handle in alloc_sizes: del alloc_sizes[st.handle]
+      if st.handle in mmaped: del mmaped[st.handle]
+    elif nr == 2:
+      st = get_struct(argp, qcom_dsp.struct_ion_fd_data)
+      print(ret, "ION_IOC_MAP", format_struct(st))
+      mmaped[st.handle] = mmap.mmap(st.fd, alloc_sizes[st.handle])
+  elif fn == "/dev/adsprpc-smd":
+    assert chr(itype) == 'R'
+    if nr == 8:
+      st = ctypes.c_uint32.from_address(argp)
+      print(ret, "FASTRPC_IOCTL_GETINFO", st.value)
+    elif nr == 2:
+      st = get_struct(argp, qcom_dsp.struct_fastrpc_ioctl_mmap)
+      print(ret, "FASTRPC_IOCTL_MMAP", format_struct(st))
+    elif nr == 1:
+      # https://research.checkpoint.com/2021/pwn2own-qualcomm-dsp/
+      st = get_struct(argp, qcom_dsp.struct_fastrpc_ioctl_invoke)
+      print(ret, "FASTRPC_IOCTL_INVOKE", format_struct(st))
+      # 0xFF000000 = Method index and attribute (the highest byte)
+      # 0x00FF0000 = Number of input arguments
+      # 0x0000FF00 = Number of output arguments
+      # 0x000000F0 = Number of input handles
+      # 0x0000000F = Number of output handles
+
+      method = (st.sc>>24) & 0xFF
+      in_args = (st.sc>>16) & 0xFF
+      out_args = (st.sc>>8) & 0xFF
+      in_h = (st.sc>>4) & 0xF
+      out_h = (st.sc>>0) & 0xF
+      print(f"\tm:{method} ia:{in_args} oa:{out_args} ih:{in_h} oh:{out_h}")
+      """
+      if in_args or out_args:
+        for arg in range(in_args+out_args):
+          print(arg, format_struct(st.pra[arg]))
+          if st.pra[arg].buf.pv is not None:
+            ww = to_mv(st.pra[arg].buf.pv, st.pra[arg].buf.len)
+            hexdump(to_mv(st.pra[arg].buf.pv, st.pra[arg].buf.len)[:0x40])
+      """
+    elif nr == 6:
+      print(ret, "FASTRPC_IOCTL_INIT", format_struct(ini:=get_struct(argp, qcom_dsp.struct_fastrpc_ioctl_init)))
+      print(os.readlink(f"/proc/self/fd/{ini.filefd}"))
+      # print(bytearray(to_mv(ini.file, ini.filelen)))
+    elif nr == 7:
+      print(ret, "FASTRPC_IOCTL_INVOKE_ATTRS", format_struct(ini:=get_struct(argp, qcom_dsp.struct_fastrpc_ioctl_invoke_attrs)))
+    elif nr == 12: print(ret, "FASTRPC_IOCTL_CONTROL", format_struct(get_struct(argp, qcom_dsp.struct_fastrpc_ioctl_control)))
+    elif nr == 4:
+      st_fd = get_struct(argp, qcom_dsp.struct_fastrpc_ioctl_invoke_fd)
+      st = st_fd.inv
+      print(ret, "FASTRPC_IOCTL_INVOKE_FD", format_struct(st))
+
+      method = (st.sc>>24) & 0xFF
+      in_args = (st.sc>>16) & 0xFF
+      out_args = (st.sc>>8) & 0xFF
+      in_h = (st.sc>>4) & 0xF
+      out_h = (st.sc>>0) & 0xF
+      print(f"\tm:{method} ia:{in_args} oa:{out_args} ih:{in_h} oh:{out_h}")
+
+      if st.sc in [0x2030200, 0x3040300]:
+        for handle, mapped in mmaped.items():
+          print(f" buffer {handle} {alloc_sizes[handle]:X}")
+          with open(f"/tmp/buf_{st.sc:X}_{handle}_{alloc_sizes[handle]:X}", "wb") as f: f.write(mapped)
+    else:
+      print(f"{ret} UNPARSED {nr}")
+  else:
+    print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", fn)
+
--- a/extra/dsp/preload.c
+++ b/extra/dsp/preload.c
@@ -0,0 +1,79 @@
+#define _GNU_SOURCE  // Must be defined before any includes for RTLD_NEXT
+#include <errno.h>
+#include <stdio.h>
+#include <dlfcn.h>
+#include <Python.h>  // Include Python header
+//#include <sys/ioctl.h>
+
+// Runs when this shared object is loaded: start an embedded interpreter and import the hook
+// module that provides handle_ioctl(). Kept below the includes so the Python C API is declared
+// before it is called.
+__attribute__((constructor))
+void preload_init() {
+  Py_Initialize();
+  PyRun_SimpleString("print('hello from c'); import extra.dsp.hook");
+}
+
+// Next "ioctl" in library search order, resolved lazily with dlsym(RTLD_NEXT)
+static int (*real_ioctl)(int fd, unsigned long request, void *arg) = NULL;
+
+// Report one completed ioctl to extra.dsp.hook.handle_ioctl(fd, request, arg, ret).
+// Python errors are printed to stderr and otherwise ignored; nothing is returned to the caller.
+static void report_ioctl(int fd, unsigned long request, void *arg, int ret) {
+	PyObject *pModule = PyImport_ImportModule("extra.dsp.hook");
+	if (pModule == NULL) {
+		PyErr_Print();
+		fprintf(stderr, "Failed to load 'extra.dsp.hook'\n");
+		return;
+	}
+
+	// Get the function (assume it's called "handle_ioctl")
+	PyObject *pFunc = PyObject_GetAttrString(pModule, "handle_ioctl");
+	if (pFunc && PyCallable_Check(pFunc)) {
+		// Arguments are built by the call itself, so no temporary PyLong objects are left behind;
+		// "N" passes ownership of the pointer object to the argument tuple.
+		PyObject *pValue = PyObject_CallFunction(pFunc, "ikNi", fd, request, PyLong_FromVoidPtr(arg), ret);
+		if (pValue != NULL) {
+			Py_DECREF(pValue);
+		} else {
+			PyErr_Print();  // Print Python error if call fails
+		}
+	} else {
+		if (PyErr_Occurred()) PyErr_Print();
+		fprintf(stderr, "Cannot find function 'handle_ioctl'\n");
+	}
+	Py_XDECREF(pFunc);  // also releases an attribute that exists but is not callable
+	Py_DECREF(pModule);
+}
+
+// Our custom ioctl hook: forwards to the real ioctl, then reports the call to Python.
+// Returns exactly what the real ioctl returned, with its errno preserved.
+int ioctl(int fd, unsigned long request, void *arg) {
+	// Initialize the real ioctl function pointer on first call
+	if (!real_ioctl) {
+		real_ioctl = dlsym(RTLD_NEXT, "ioctl");
+		if (!real_ioctl) {
+			fprintf(stderr, "Error: Could not find real ioctl\n");
+			errno = ENOSYS;
+			return -1;
+		}
+	}
+
+	// Log the call
+	//printf("Hooked ioctl: tid=%d fd=%d, request=0x%lx, arg=%p\n", gettid(), fd, request, arg);
+
+	// Call the original ioctl and remember its errno: the Python callback below does its own
+	// I/O and must not change what the caller observes after a failed ioctl.
+	int ret = real_ioctl(fd, request, arg);
+	int saved_errno = errno;
+
+	// NOTE(review): the GIL is not acquired here (PyGILState_Ensure/Release were left disabled),
+	// so this presumably relies on ioctl being called from the thread that ran Py_Initialize -- confirm.
+	//PyGILState_STATE gstate = PyGILState_Ensure();
+
+	// Skip reporting for ioctls issued before the interpreter exists
+	if (Py_IsInitialized()) report_ioctl(fd, request, arg, ret);
+
+	//PyGILState_Release(gstate);
+	errno = saved_errno;
+	return ret;
+}
+
--- a/extra/dsp/run.py
+++ b/extra/dsp/run.py
@@ -112,7 +112,7 @@ def install_hook(c_function, python_function):
  return orig_func

 libc = ctypes.CDLL(ctypes.util.find_library("libc"))
-install_hook(libc.ioctl, ioctl)
+#install_hook(libc.ioctl, ioctl)
 adsp = ctypes.CDLL(ctypes.util.find_library("adsprpc"))

 def send_rpc_invoke(filename):
--- a/extra/dsp/snpe.sh
+++ b/extra/dsp/snpe.sh
@@ -0,0 +1,11 @@
+#!/bin/bash -e
+echo "building"
+gcc -shared -fPIC -o preload_python.so preload.c -L/usr/local/pyenv/versions/3.11.4/lib -lpython3.11 -I/usr/local/pyenv/versions/3.11.4/include/python3.11
+echo "compiled"
+export LD_LIBRARY_PATH="/usr/local/pyenv/versions/3.11.4/lib;/data/snpe"
+export LD_PRELOAD="$PWD/preload_python.so"
+export PYTHONPATH="/data/tinygrad"
+cd /data/snpe
+#ADSP_LIBRARY_PATH="." strace -f -e ioctl ./snpe-net-run --container MobileNetV2.dlc --input_list hello --use_dsp
+ADSP_LIBRARY_PATH="." ./snpe-net-run --container MobileNetV2.dlc --input_list hello --use_dsp
+
--- a/extra/dsp/snpe_logs/dlc_info_2
+++ b/extra/dsp/snpe_logs/dlc_info_2
@@ -0,0 +1,715 @@
+DLC info for: /home/batman/xx/ml_tools/snpe/snpe-1.61.0.3358/mobilenetv2-7.dlc
+Model Version: N/A
+Model Copyright:N/A
+-----------------------------------------------------------------------------------------------------------------------------------------
+| Id  | Name                 | Type                  | Inputs  | Outputs | Out Dims     | Runtimes | Parameters                         |
+-----------------------------------------------------------------------------------------------------------------------------------------
+| 0   | input                | data                  | input   | input   | 1x224x224x3  | A D G C  | input_preprocessing: passthrough   |
+|     |                      |                       |         |         |              |          | input_type: image                  |
+| 1   | Conv_0               | convolutional         | input   | 474     | 1x112x112x32 | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 2                        |
+|     |                      |                       |         |         |              |          | stride y: 2                        |
+|     |                      |                       |         |         |              |          | num filters: 32                    |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | param count: 896 (0.0257%)         |
+|     |                      |                       |         |         |              |          | MACs per inference: 10M (3.6%)     |
+| 2   | Clip_1               | neuron                | 474     | 317     | 1x112x112x32 | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 3   | Conv_2               | convolutional         | 317     | 477     | 1x112x112x32 | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 32                    |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 32                         |
+|     |                      |                       |         |         |              |          | param count: 320 (0.00917%)        |
+|     |                      |                       |         |         |              |          | MACs per inference: 3M (1.2%)      |
+| 4   | Clip_3               | neuron                | 477     | 320     | 1x112x112x32 | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 5   | Conv_4               | convolutional         | 320     | 480     | 1x112x112x16 | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 16                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 528 (0.0151%)         |
+|     |                      |                       |         |         |              |          | MACs per inference: 6M (2.13%)     |
+| 6   | Conv_5               | convolutional         | 480     | 483     | 1x112x112x96 | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 96                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 1k (0.0468%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 19M (6.4%)     |
+| 7   | Clip_6               | neuron                | 483     | 325     | 1x112x112x96 | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 8   | Conv_7               | convolutional         | 325     | 486     | 1x56x56x96   | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 2                        |
+|     |                      |                       |         |         |              |          | stride y: 2                        |
+|     |                      |                       |         |         |              |          | num filters: 96                    |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 96                         |
+|     |                      |                       |         |         |              |          | param count: 960 (0.0275%)         |
+|     |                      |                       |         |         |              |          | MACs per inference: 2M (0.9%)      |
+| 9   | Clip_8               | neuron                | 486     | 328     | 1x56x56x96   | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 10  | Conv_9               | convolutional         | 328     | 489     | 1x56x56x24   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 24                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 2k (0.0667%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 7M (2.4%)      |
+| 11  | Conv_10              | convolutional         | 489     | 492     | 1x56x56x144  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 144                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 3k (0.103%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 10M (3.6%)     |
+| 12  | Clip_11              | neuron                | 492     | 333     | 1x56x56x144  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 13  | Conv_12              | convolutional         | 333     | 495     | 1x56x56x144  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 144                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 144                        |
+|     |                      |                       |         |         |              |          | param count: 1k (0.0413%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.35%)     |
+| 14  | Clip_13              | neuron                | 495     | 336     | 1x56x56x144  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 15  | Conv_14              | convolutional         | 336     | 498     | 1x56x56x24   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 24                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 3k (0.0998%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 10M (3.6%)     |
+| 16  | Add_15               | elementwise_binary_op | 489     | 339     | 1x56x56x24   | A D G C  | operation: sum                     |
+|     |                      |                       | 498     |         |              |          | MACs per inference: 75k (0.025%)   |
+| 17  | Conv_16              | convolutional         | 339     | 501     | 1x56x56x144  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 144                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 3k (0.103%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 10M (3.6%)     |
+| 18  | Clip_17              | neuron                | 501     | 342     | 1x56x56x144  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 19  | Conv_18              | convolutional         | 342     | 504     | 1x28x28x144  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 2                        |
+|     |                      |                       |         |         |              |          | stride y: 2                        |
+|     |                      |                       |         |         |              |          | num filters: 144                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 144                        |
+|     |                      |                       |         |         |              |          | param count: 1k (0.0413%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 1M (0.338%)    |
+| 20  | Clip_19              | neuron                | 504     | 345     | 1x28x28x144  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 21  | Conv_20              | convolutional         | 345     | 507     | 1x28x28x32   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 32                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 4k (0.133%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 3M (1.2%)      |
+| 22  | Conv_21              | convolutional         | 507     | 510     | 1x28x28x192  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 192                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 6k (0.182%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 23  | Clip_22              | neuron                | 510     | 350     | 1x28x28x192  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 24  | Conv_23              | convolutional         | 350     | 513     | 1x28x28x192  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 192                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 192                        |
+|     |                      |                       |         |         |              |          | param count: 1k (0.055%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 1M (0.45%)     |
+| 25  | Clip_24              | neuron                | 513     | 353     | 1x28x28x192  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 26  | Conv_25              | convolutional         | 353     | 516     | 1x28x28x32   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 32                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 6k (0.177%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 27  | Add_26               | elementwise_binary_op | 507     | 356     | 1x28x28x32   | A D G C  | operation: sum                     |
+|     |                      |                       | 516     |         |              |          | MACs per inference: 25k (0.00833%) |
+| 28  | Conv_27              | convolutional         | 356     | 519     | 1x28x28x192  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 192                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 6k (0.182%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 29  | Clip_28              | neuron                | 519     | 359     | 1x28x28x192  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 30  | Conv_29              | convolutional         | 359     | 522     | 1x28x28x192  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 192                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 192                        |
+|     |                      |                       |         |         |              |          | param count: 1k (0.055%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 1M (0.45%)     |
+| 31  | Clip_30              | neuron                | 522     | 362     | 1x28x28x192  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 32  | Conv_31              | convolutional         | 362     | 525     | 1x28x28x32   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 32                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 6k (0.177%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 33  | Add_32               | elementwise_binary_op | 356     | 365     | 1x28x28x32   | A D G C  | operation: sum                     |
+|     |                      |                       | 525     |         |              |          | MACs per inference: 25k (0.00833%) |
+| 34  | Conv_33              | convolutional         | 365     | 528     | 1x28x28x192  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 192                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 6k (0.182%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 35  | Clip_34              | neuron                | 528     | 368     | 1x28x28x192  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 36  | Conv_35              | convolutional         | 368     | 531     | 1x14x14x192  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 2                        |
+|     |                      |                       |         |         |              |          | stride y: 2                        |
+|     |                      |                       |         |         |              |          | num filters: 192                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 192                        |
+|     |                      |                       |         |         |              |          | param count: 1k (0.055%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 338k (0.113%)  |
+| 37  | Clip_36              | neuron                | 531     | 371     | 1x14x14x192  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 38  | Conv_37              | convolutional         | 371     | 534     | 1x14x14x64   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 64                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 12k (0.354%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 2M (0.8%)      |
+| 39  | Conv_38              | convolutional         | 534     | 537     | 1x14x14x384  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 384                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 24k (0.716%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 40  | Clip_39              | neuron                | 537     | 376     | 1x14x14x384  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 41  | Conv_40              | convolutional         | 376     | 540     | 1x14x14x384  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 384                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 384                        |
+|     |                      |                       |         |         |              |          | param count: 3k (0.11%)            |
+|     |                      |                       |         |         |              |          | MACs per inference: 677k (0.225%)  |
+| 42  | Clip_41              | neuron                | 540     | 379     | 1x14x14x384  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 43  | Conv_42              | convolutional         | 379     | 543     | 1x14x14x64   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 64                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 24k (0.706%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 44  | Add_43               | elementwise_binary_op | 534     | 382     | 1x14x14x64   | A D G C  | operation: sum                     |
+|     |                      |                       | 543     |         |              |          | MACs per inference: 12k (0.00417%) |
+| 45  | Conv_44              | convolutional         | 382     | 546     | 1x14x14x384  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 384                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 24k (0.716%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 46  | Clip_45              | neuron                | 546     | 385     | 1x14x14x384  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 47  | Conv_46              | convolutional         | 385     | 549     | 1x14x14x384  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 384                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 384                        |
+|     |                      |                       |         |         |              |          | param count: 3k (0.11%)            |
+|     |                      |                       |         |         |              |          | MACs per inference: 677k (0.225%)  |
+| 48  | Clip_47              | neuron                | 549     | 388     | 1x14x14x384  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 49  | Conv_48              | convolutional         | 388     | 552     | 1x14x14x64   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 64                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 24k (0.706%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 50  | Add_49               | elementwise_binary_op | 382     | 391     | 1x14x14x64   | A D G C  | operation: sum                     |
+|     |                      |                       | 552     |         |              |          | MACs per inference: 12k (0.00417%) |
+| 51  | Conv_50              | convolutional         | 391     | 555     | 1x14x14x384  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 384                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 24k (0.716%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 52  | Clip_51              | neuron                | 555     | 394     | 1x14x14x384  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 53  | Conv_52              | convolutional         | 394     | 558     | 1x14x14x384  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 384                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 384                        |
+|     |                      |                       |         |         |              |          | param count: 3k (0.11%)            |
+|     |                      |                       |         |         |              |          | MACs per inference: 677k (0.225%)  |
+| 54  | Clip_53              | neuron                | 558     | 397     | 1x14x14x384  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 55  | Conv_54              | convolutional         | 397     | 561     | 1x14x14x64   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 64                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 24k (0.706%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 56  | Add_55               | elementwise_binary_op | 391     | 400     | 1x14x14x64   | A D G C  | operation: sum                     |
+|     |                      |                       | 561     |         |              |          | MACs per inference: 12k (0.00417%) |
+| 57  | Conv_56              | convolutional         | 400     | 564     | 1x14x14x384  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 384                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 24k (0.716%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.6%)      |
+| 58  | Clip_57              | neuron                | 564     | 403     | 1x14x14x384  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 59  | Conv_58              | convolutional         | 403     | 567     | 1x14x14x384  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 384                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 384                        |
+|     |                      |                       |         |         |              |          | param count: 3k (0.11%)            |
+|     |                      |                       |         |         |              |          | MACs per inference: 677k (0.225%)  |
+| 60  | Clip_59              | neuron                | 567     | 406     | 1x14x14x384  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 61  | Conv_60              | convolutional         | 406     | 570     | 1x14x14x96   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 96                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 36k (1.06%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 7M (2.4%)      |
+| 62  | Conv_61              | convolutional         | 570     | 573     | 1x14x14x576  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 576                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 55k (1.6%)            |
+|     |                      |                       |         |         |              |          | MACs per inference: 10M (3.6%)     |
+| 63  | Clip_62              | neuron                | 573     | 411     | 1x14x14x576  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 64  | Conv_63              | convolutional         | 411     | 576     | 1x14x14x576  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 576                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 576                        |
+|     |                      |                       |         |         |              |          | param count: 5k (0.165%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 1M (0.338%)    |
+| 65  | Clip_64              | neuron                | 576     | 414     | 1x14x14x576  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 66  | Conv_65              | convolutional         | 414     | 579     | 1x14x14x96   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 96                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 55k (1.59%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 10M (3.6%)     |
+| 67  | Add_66               | elementwise_binary_op | 570     | 417     | 1x14x14x96   | A D G C  | operation: sum                     |
+|     |                      |                       | 579     |         |              |          | MACs per inference: 18k (0.00625%) |
+| 68  | Conv_67              | convolutional         | 417     | 582     | 1x14x14x576  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 576                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 55k (1.6%)            |
+|     |                      |                       |         |         |              |          | MACs per inference: 10M (3.6%)     |
+| 69  | Clip_68              | neuron                | 582     | 420     | 1x14x14x576  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 70  | Conv_69              | convolutional         | 420     | 585     | 1x14x14x576  | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 576                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 576                        |
+|     |                      |                       |         |         |              |          | param count: 5k (0.165%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 1M (0.338%)    |
+| 71  | Clip_70              | neuron                | 585     | 423     | 1x14x14x576  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 72  | Conv_71              | convolutional         | 423     | 588     | 1x14x14x96   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 96                    |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 55k (1.59%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 10M (3.6%)     |
+| 73  | Add_72               | elementwise_binary_op | 417     | 426     | 1x14x14x96   | A D G C  | operation: sum                     |
+|     |                      |                       | 588     |         |              |          | MACs per inference: 18k (0.00625%) |
+| 74  | Conv_73              | convolutional         | 426     | 591     | 1x14x14x576  | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 576                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 55k (1.6%)            |
+|     |                      |                       |         |         |              |          | MACs per inference: 10M (3.6%)     |
+| 75  | Clip_74              | neuron                | 591     | 429     | 1x14x14x576  | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 76  | Conv_75              | convolutional         | 429     | 594     | 1x7x7x576    | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 2                        |
+|     |                      |                       |         |         |              |          | stride y: 2                        |
+|     |                      |                       |         |         |              |          | num filters: 576                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 576                        |
+|     |                      |                       |         |         |              |          | param count: 5k (0.165%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 254k (0.0844%) |
+| 77  | Clip_76              | neuron                | 594     | 432     | 1x7x7x576    | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 78  | Conv_77              | convolutional         | 432     | 597     | 1x7x7x160    | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 160                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 92k (2.65%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 4M (1.5%)      |
+| 79  | Conv_78              | convolutional         | 597     | 600     | 1x7x7x960    | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 960                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 154k (4.43%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 7M (2.5%)      |
+| 80  | Clip_79              | neuron                | 600     | 437     | 1x7x7x960    | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 81  | Conv_80              | convolutional         | 437     | 603     | 1x7x7x960    | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 960                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 960                        |
+|     |                      |                       |         |         |              |          | param count: 9k (0.275%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 423k (0.141%)  |
+| 82  | Clip_81              | neuron                | 603     | 440     | 1x7x7x960    | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 83  | Conv_82              | convolutional         | 440     | 606     | 1x7x7x160    | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 160                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 153k (4.41%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 7M (2.5%)      |
+| 84  | Add_83               | elementwise_binary_op | 597     | 443     | 1x7x7x160    | A D G C  | operation: sum                     |
+|     |                      |                       | 606     |         |              |          | MACs per inference: 7k (0.0026%)   |
+| 85  | Conv_84              | convolutional         | 443     | 609     | 1x7x7x960    | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 960                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 154k (4.43%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 7M (2.5%)      |
+| 86  | Clip_85              | neuron                | 609     | 446     | 1x7x7x960    | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 87  | Conv_86              | convolutional         | 446     | 612     | 1x7x7x960    | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 960                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 960                        |
+|     |                      |                       |         |         |              |          | param count: 9k (0.275%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 423k (0.141%)  |
+| 88  | Clip_87              | neuron                | 612     | 449     | 1x7x7x960    | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 89  | Conv_88              | convolutional         | 449     | 615     | 1x7x7x160    | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 160                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 153k (4.41%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 7M (2.5%)      |
+| 90  | Add_89               | elementwise_binary_op | 443     | 452     | 1x7x7x160    | A D G C  | operation: sum                     |
+|     |                      |                       | 615     |         |              |          | MACs per inference: 7k (0.0026%)   |
+| 91  | Conv_90              | convolutional         | 452     | 618     | 1x7x7x960    | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 960                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 154k (4.43%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 7M (2.5%)      |
+| 92  | Clip_91              | neuron                | 618     | 455     | 1x7x7x960    | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 93  | Conv_92              | convolutional         | 455     | 621     | 1x7x7x960    | A D G C  | padding x: 1                       |
+|     |                      |                       |         |         |              |          | padding y: 1                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 960                   |
+|     |                      |                       |         |         |              |          | kernel: 3x3                        |
+|     |                      |                       |         |         |              |          | groups: 960                        |
+|     |                      |                       |         |         |              |          | param count: 9k (0.275%)           |
+|     |                      |                       |         |         |              |          | MACs per inference: 423k (0.141%)  |
+| 94  | Clip_93              | neuron                | 621     | 458     | 1x7x7x960    | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 95  | Conv_94              | convolutional         | 458     | 624     | 1x7x7x320    | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 320                   |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 307k (8.82%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 15M (5%)       |
+| 96  | Conv_95              | convolutional         | 624     | 627     | 1x7x7x1280   | A D G C  | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | padding mode: zero                 |
+|     |                      |                       |         |         |              |          | stride x: 1                        |
+|     |                      |                       |         |         |              |          | stride y: 1                        |
+|     |                      |                       |         |         |              |          | num filters: 1280                  |
+|     |                      |                       |         |         |              |          | kernel: 1x1                        |
+|     |                      |                       |         |         |              |          | param count: 410k (11.8%)          |
+|     |                      |                       |         |         |              |          | MACs per inference: 20M (6.67%)    |
+| 97  | Clip_96              | neuron                | 627     | 463     | 1x7x7x1280   | A D G C  | a: 0                               |
+|     |                      |                       |         |         |              |          | b: 0                               |
+|     |                      |                       |         |         |              |          | min_clamp: 0                       |
+|     |                      |                       |         |         |              |          | max_clamp: 6                       |
+|     |                      |                       |         |         |              |          | func: relu_min_max                 |
+| 98  | GlobalAveragePool_97 | pooling               | 463     | 464     | 1x1x1x1280   | A D G C  | pool size x: 7                     |
+|     |                      |                       |         |         |              |          | pool size y: 7                     |
+|     |                      |                       |         |         |              |          | stride x: 7                        |
+|     |                      |                       |         |         |              |          | stride y: 7                        |
+|     |                      |                       |         |         |              |          | padding x: 0                       |
+|     |                      |                       |         |         |              |          | padding y: 0                       |
+|     |                      |                       |         |         |              |          | pool_type: POOL_AVG                |
+|     |                      |                       |         |         |              |          | MACs per inference: 62k (0.0208%)  |
+| 99  | 464.ncs              | permute               | 464     | 464.ncs | 1x1280x1x1   | A D G C  | permute_order: [0, 3, 1, 2]        |
+| 100 | Gemm_104             | fully_connected       | 464.ncs | output  | 1x1000       | A D G C  | param count: 1M (36.7%)            |
+|     |                      |                       |         |         |              |          | MACs per inference: 1M (0.425%)    |
+-----------------------------------------------------------------------------------------------------------------------------------------
+Note: The supported runtimes column assumes a processor target of Snapdragon 835 (8998)
+Key : A:AIP
+      D:DSP
+      G:GPU
+      C:CPU
+
+Total parameters: 3487816 (13 MB assuming single precision float)
+Total MACs per inference: 301M (100%)
+Converter command: snpe-onnx-to-dlc adjust_nms_features_dims=False align_matmul_ranks=True copyright_file=None custom_op_config_paths=None debug=-1 disable_batchnorm_folding=False disable_chaining_eltwise_ops=False dry_run=None dumpIR=False dump_inferred_model=False dump_value_info=False enable_strict_validation=False extract_color_transform=False force_prune_cast_ops=True handle_gather_negative_indices=False inject_cast_for_gather=False input_dim=[['input', '1,3,224,224']] input_dtype=[] input_encoding=[] input_layout=[] input_type=[['input', 'image']] keep_disconnected_nodes=False keep_quant_nodes=False match_caffe_ssd_to_tf=False model_version=None no_simplification=False out_names=['output'] perform_axes_to_spatial_first_order=True prepare_inputs_as_params=True preprocess_lstm_ops=False preprocess_roi_pool_inputs=False quantization_overrides= squash_box_decoder=False unroll_lstm_time_steps=False use_convert_quantization_nodes=True validation_target=[]
+Quantizer command: N/A
+DLC created with converter version: 1.61.0.3358
+Layers used by DLC: CONVOLUTIONAL, DATA, ELEMENTWISE_BINARY_OP_SUM, FULLY_CONNECTED, NEURON_RELU_MIN_MAX, PERMUTE, POOLING
+Est. Steady-State Memory Needed to Run: 164.3 MiB
+-----------------------------------------------------------------------------------------------------------------------------------------
+
--- a/extra/dsp/snpe_logs/high_perf_2
+++ b/extra/dsp/snpe_logs/high_perf_2
@@ -0,0 +1,131 @@
+Log File Created: Tue Mar 18 01:33:12 2025
+Time Scale: 1e-06
+Epoch Timestamp: 1742286792883569 Steady Clock Timestamp: 75586845756
+Software library version: 1.61.0.3358
+
+Dnn Runtime Load/Deserialize/Create/De-Init Statistics:
+--------------------------------------------------
+Load: 333 us
+Deserialize: 32452 us
+Create: 143084 us
+
+Init: 178071 us
+De-Init: 16710 us
+
+Create Network(s): 86850 us
+RPC Init Time: 43213 us
+Snpe Accelerator Init Time: 42154 us
+Accelerator Init Time: 39189 us
+
+Average SNPE Statistics:
+------------------------------
+Total Inference Time: 11868 us
+Forward Propagate Time: 11816 us
+RPC Execute Time: 9810 us
+Snpe Accelerator Time: 9129 us
+Accelerator Time: 8701 us
+Misc Accelerator Time: 10 us
+
+Layer Times: 
+---------------
+0: 42 us : DSP
+1: 0 us : DSP
+2: 254 us : DSP
+3: 0 us : DSP
+4: 153 us : DSP
+5: 295 us : DSP
+6: 0 us : DSP
+7: 287 us : DSP
+8: 0 us : DSP
+9: 162 us : DSP
+10: 210 us : DSP
+11: 0 us : DSP
+12: 138 us : DSP
+13: 0 us : DSP
+14: 176 us : DSP
+15: 293 us : DSP
+16: 60 us : DSP
+17: 0 us : DSP
+18: 157 us : DSP
+19: 0 us : DSP
+20: 112 us : DSP
+21: 134 us : DSP
+22: 0 us : DSP
+23: 81 us : DSP
+24: 0 us : DSP
+25: 104 us : DSP
+26: 130 us : DSP
+27: 37 us : DSP
+28: 0 us : DSP
+29: 81 us : DSP
+30: 0 us : DSP
+31: 87 us : DSP
+32: 124 us : DSP
+33: 30 us : DSP
+34: 0 us : DSP
+35: 87 us : DSP
+36: 0 us : DSP
+37: 63 us : DSP
+38: 74 us : DSP
+39: 0 us : DSP
+40: 102 us : DSP
+41: 0 us : DSP
+42: 82 us : DSP
+43: 95 us : DSP
+44: 29 us : DSP
+45: 0 us : DSP
+46: 112 us : DSP
+47: 0 us : DSP
+48: 88 us : DSP
+49: 96 us : DSP
+50: 25 us : DSP
+51: 0 us : DSP
+52: 103 us : DSP
+53: 0 us : DSP
+54: 80 us : DSP
+55: 100 us : DSP
+56: 26 us : DSP
+57: 0 us : DSP
+58: 102 us : DSP
+59: 0 us : DSP
+60: 85 us : DSP
+61: 129 us : DSP
+62: 0 us : DSP
+63: 155 us : DSP
+64: 0 us : DSP
+65: 113 us : DSP
+66: 194 us : DSP
+67: 34 us : DSP
+68: 0 us : DSP
+69: 157 us : DSP
+70: 0 us : DSP
+71: 120 us : DSP
+72: 198 us : DSP
+73: 34 us : DSP
+74: 0 us : DSP
+75: 155 us : DSP
+76: 0 us : DSP
+77: 101 us : DSP
+78: 121 us : DSP
+79: 0 us : DSP
+80: 256 us : DSP
+81: 0 us : DSP
+82: 134 us : DSP
+83: 159 us : DSP
+84: 31 us : DSP
+85: 0 us : DSP
+86: 199 us : DSP
+87: 0 us : DSP
+88: 142 us : DSP
+89: 152 us : DSP
+90: 26 us : DSP
+91: 0 us : DSP
+92: 202 us : DSP
+93: 0 us : DSP
+94: 143 us : DSP
+95: 278 us : DSP
+96: 0 us : DSP
+97: 316 us : DSP
+98: 40 us : DSP
+99: 12 us : DSP
+100: 199 us : DSP
--- a/extra/dsp/snpe_logs/parse.py
+++ b/extra/dsp/snpe_logs/parse.py
@@ -0,0 +1,21 @@
+di = open("dlc_info_2").read().split("\n")
+layers = {}
+for l in di:
+  if not l.startswith("| "): continue
+  if l.startswith("|     |"): continue
+  ll = [x.strip() for x in l.split("|")]
+  if ll[1] == "Id": continue
+  layers[int(ll[1])] = (ll[2], ll[6])
+hp = open("high_perf_2").read().split("Layer Times:")[1].strip().split("\n")[2:]
+sl = 1
+tms = 0
+for l in hp:
+  kk, tm, _ = l.split(" ", 2)
+  tm = int(tm)
+  lnum = int(kk.strip(":"))
+  if int(tm) != 0:
+    print(f"{sl:2d} {tm:4d} us {layers[lnum]}")
+    tms += tm
+    sl += 1
+print(f"total time, {tms/1000:.2f} ms")
+
--- a/extra/onnx.py
+++ b/extra/onnx.py
@@ -728,9 +728,12 @@ def get_onnx_ops():
    y_scale, y_zero_point = _prepare_quantize(x, y_scale, y_zero_point, axis, block_size)
    if out_dtype == dtypes.uchar:
      # this appears to work in practice, at least for uchar out_dtype. it folds with the quantize stuff
-      return _clamp_cast((x / y_scale + 0.4999999 + y_zero_point).int(), out_dtype).contiguous()
+      ret = _clamp_cast((x / y_scale + 0.4999999 + y_zero_point).int(), out_dtype)
    else:
-      return _clamp_cast(((x / y_scale).round() + y_zero_point), out_dtype).contiguous()
+      ret = _clamp_cast(((x / y_scale).round() + y_zero_point), out_dtype)
+    # both NHWC=1 and DONT_GROUP_REDUCES=1 must be set for this to work
+    if getenv("NHWC") and len(ret.shape) == 4: return ret.permute(0,2,3,1).contiguous().permute(0,3,1,2)
+    return ret.contiguous()

  def DynamicQuantizeLinear(x: Tensor):
    # only support uint8
--- a/extra/replay_pkl.py
+++ b/extra/replay_pkl.py
@@ -23,6 +23,8 @@ if __name__ == "__main__":
        p: ProgramSpec = ei.prg.p
        k = Kernel(p.ast, Device["DSP"].renderer)
        if not getenv("NOOPT"):
+          # only NCHW
+          """
          if knum in [6,7,9,11]:
            k.apply_opt(Opt(OptOps.PADTO, 1, 128))
            k.apply_opt(Opt(OptOps.UPCAST, 1, 128))
@@ -48,6 +50,15 @@ if __name__ == "__main__":
            k.apply_opt(Opt(OptOps.UPCAST, 1, 128))
          else:
            k.hand_coded_optimizations()
+          """
+          if knum == 3:
+            k.apply_opt(Opt(OptOps.UNROLL, 0, 0))
+            k.apply_opt(Opt(OptOps.UPCAST, 1, 16))
+            k.apply_opt(Opt(OptOps.UPCAST, 0, 128//16))
+            #k.apply_opt(Opt(OptOps.UPCAST, 0, 8))
+            pass
+          else:
+            k.hand_coded_optimizations()
          #if knum in [5]: k.apply_opt(Opt(OptOps.UPCAST, 1, 2))
        p2 = k.to_program()
        new_ei = replace(ei, prg=CompiledRunner(p2), bufs=[Buffer("DSP", 1024+b.size*2, b.dtype).view(b.size, b.dtype, 512) for b in ei.bufs])