#!/usr/bin/env python3
import os
import sys
import time
import struct
import numpy as np

BASEDIR = os.path.dirname(os.path.abspath(__file__))+"/"
THNEED_KERNELS = "../../selfdrive/modeld/thneed/kernels/"

def load_thneed_model(fn="model.thneed", float32=False, replace=None):
  import pyopencl as cl
  devices = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
  if len(devices) == 0: # settle for CPU
    devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], [])
  ctx = cl.Context(devices=devices[0:1])
  q = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
  mf = cl.mem_flags
  image_fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT if float32 else cl.channel_type.HALF_FLOAT)
  image_fmt_32 = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
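
  # A .thneed file, as parsed below, is a small container: a 4-byte JSON length,
  # that many bytes of JSON metadata (kernels, objects, programs, ...), then a raw
  # blob holding the weight data (and any appended program binaries).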
  import json
  import traceback
  with open(fn if (fn[0] == "/") else (BASEDIR+"../models/"+fn), "rb") as f:
    json_len = struct.unpack("I", f.read(4))[0]
    jdat = json.loads(f.read(json_len).decode('latin_1'))
    weights = f.read()

  # jdat keys: 'kernels', 'objects', 'programs', 'binaries' (and optionally 'inputs'/'outputs')
  prgs = {}
  for k,v in jdat['programs'].items():
    print("building", k)
    try:
      prgs[k] = cl.Program(ctx, v).build().__getattr__(k)
    except Exception:
      print("FAILED", k)
      traceback.print_exc()
      exit(0)
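
  # Each entry in jdat['objects'] below becomes a cl.Buffer or a cl.Image. Objects
  # flagged needs_load take their bytes from the weights blob, consumed in order
  # via ptr; buffer-backed image objects reuse an earlier buffer by id.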
  bufs = {'\x00\x00\x00\x00\x00\x00\x00\x00': None}
  bufs_loaded = {}
  ptr = 0
  for o in jdat['objects']:
    #print(o)
    if o['needs_load']:
      nptr = ptr + o['size']
      o['data'] = weights[ptr:nptr]
      ptr = nptr

    if o['arg_type'] == "image2d_t" or o['arg_type'] == "image1d_t":
      tfmt = image_fmt_32 if 'float32' in o and o['float32'] else image_fmt
      if o['arg_type'] == "image2d_t":
        if 'buffer_id' in o and o['height'] == 1 and not bufs_loaded[o['buffer_id']]:
          # hack: use an image1d since we can back that with a buffer
          buf = cl.Image(ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']])
        else:
          # buffers aren't supported in image2d, copy the buffer into the image
          if 'buffer_id' in o and bufs_loaded[o['buffer_id']]:
            arr = np.zeros(bufs[o['buffer_id']].size // 2, dtype=np.float16)
            cl.enqueue_copy(q, arr, bufs[o['buffer_id']])
            buf = cl.Image(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt,
              shape=(o['width'], o['height']), pitches=(o['row_pitch'],), hostbuf=arr)
          elif o['needs_load']:
            buf = cl.Image(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt,
              shape=(o['width'], o['height']), pitches=(o['row_pitch'],), hostbuf=o['data'])
          else:
            buf = cl.Image(ctx, mf.READ_WRITE, tfmt, shape=(o['width'], o['height']))
      if o['arg_type'] == "image1d_t":
        assert not o['needs_load']
        assert not bufs_loaded[o['buffer_id']]
        buf = cl.Image(ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']])
    else:
      if 'data' in o:
        buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=o['data'])
      else:
        # zero out buffers
        buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size']*(2 if float32 else 1))

    bufs[o['id']] = buf
    bufs_loaded[o['id']] = 'data' in o
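
  # Precompiled program binaries follow the object data in the same blob. Each one is
  # rebuilt with cl.Program(context, devices, binaries) and the kernel of the same name
  # is pulled out, overwriting any source-built kernel with that name.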
  # load binaries
  for o in jdat['binaries']:
    nptr = ptr + o['length']
    prgs[o['name']] = cl.Program(ctx, ctx.devices, [weights[ptr:nptr]]).build().__getattr__(o['name'])
    ptr = nptr

  inputs, vnum, vision, outputs = [], [], [], []

  for k in jdat['inputs'] if 'inputs' in jdat else []:
    print(f"new style input {k['name']} with size {k['size']}")
    inputs.append(bufs[k['buffer_id']])
  for k in jdat['outputs'] if 'outputs' in jdat else []:
    outputs.append(bufs[k['buffer_id']])
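
  # Older thneed files don't list inputs/outputs explicitly, so they are inferred from
  # well-known kernel names below: zero_pad_image_float marks an input, zero_pad_image_half
  # marks the vision input (vnum records its kernel index so the vision and policy parts can
  # be run separately), and image2d_to_buffer_float marks an output. Kernel argument names
  # are recovered with get_arg_info where the OpenCL driver kept that metadata.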
  # old style inputs
  for i,k in enumerate(jdat['kernels']):
    if k['name'] == 'zero_pad_image_float':
      inputs.append(bufs[k['args'][1]])

    # vision model
    if k['name'] == 'zero_pad_image_half':
      vision.append(bufs[k['args'][1]])
      vnum.append(i)

    if k['name'] == 'image2d_to_buffer_float':
      outputs.append(bufs[k['args'][2]])

    k['args_name'] = []
    prg = prgs[k['name']]
    for i,arg in enumerate(k['args']):
      try:
        k['args_name'].append(prg.get_arg_info(i, cl.kernel_arg_info.NAME))
      except cl.RuntimeError:
        k['args_name'].append("<UNKNOWN>")

  vision = vision[0:1]
  vnum = vnum[0] if len(vnum) >= 1 else None
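
  # runner() copies the given numpy arrays into the input buffers/images, enqueues every
  # kernel (optionally only the vision or only the policy part, split at vnum), waits for
  # the queue, prints per-kernel profiling, and reads back the first output as a numpy array.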
  def runner(inp=[], policy_only=False, vision_only=False, debug=False):
    kernels = []
    total_runtime = 0
    # [2048, 8, 32, 1572864]
    real_inputs = (inputs[0:3]+vision) if policy_only else inputs
    for a,b in zip(real_inputs, inp):
      if debug:
        print(a.size, b.size*b.itemsize)
      #assert a.size == (b.size * b.itemsize) or float32
      cl.enqueue_copy(q, a, np.array(b, dtype=np.float16 if len(vision) > 0 and a == vision[0] else np.float32))

    #jdat['kernels'] = jdat['kernels'][0:8]

    seen_output = set(real_inputs)
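    # In the kernel list, each argument is stored as a latin-1 string whose length encodes
    # its type: empty -> local memory of args_size bytes, 2 bytes -> a packed uint16,
    # 4 bytes -> a packed uint32, 8 bytes -> a buffer/image id looked up in bufs.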
    for k in (jdat['kernels'][0:3]+jdat['kernels'][vnum:]) if policy_only else (jdat['kernels'][:vnum] if vision_only else jdat['kernels']):
      kernel = prgs[k['name']]
      aaa = []
      has_local = False
      for i,(a,sz) in enumerate(zip(k['args'], k['args_size'])):
        arg_name = k['args_name'][i]
        if len(a) == 0:
          aa = cl.LocalMemory(sz)
          has_local = True
        elif len(a) == 4:
          a = a.encode('latin_1')
          aa = np.uint32(struct.unpack("I", a)[0])
        elif len(a) == 2:
          a = a.encode('latin_1')
          aa = np.uint16(struct.unpack("H", a)[0])
        elif len(a) == 8:
          aa = bufs[a]
          if debug:
            #print(f" {arg_name:20s} : {aa}")
            if arg_name == "output":
              seen_output.add(aa)
            if arg_name == "input":
              if aa not in seen_output:
                print("ERROR", aa, "is not seen in output")
        aaa.append(aa)

      """
      for a in aaa:
        types = {cl.mem_object_type.IMAGE1D_BUFFER: "IMAGE1D_BUFFER", cl.mem_object_type.IMAGE2D: "IMAGE2D", cl.mem_object_type.IMAGE1D: "IMAGE1D"}
        if isinstance(a, cl.Image):
          if a.type == cl.mem_object_type.IMAGE2D:
            print(" ", a, types[a.type], a.shape)
          elif a.type == cl.mem_object_type.IMAGE1D_BUFFER:
            print(" ", a, types[a.type], a.size)
          else:
            print(" ", a, types[a.type])
        elif isinstance(a, cl.Buffer):
          print(" ", a, a.size)
        elif isinstance(a, cl.LocalMemory):
          print(" ", a, a.size)
        else:
          print(" ", a)
      """
      if has_local:
        e = kernel(q, k['global_work_size'], k['local_work_size'], *aaa)
      else:
        e = kernel(q, k['global_work_size'], None, *aaa)

      kernels.append((k,e))

      #if k['name'] == 'zero_pad_image_float':
      #  arr = np.zeros((aaa[1].size//4), dtype=np.float32)
      #  cl.enqueue_copy(q, arr, aaa[1])

      """
      if k['name'] == "convolution_horizontal_reduced_reads":
        print(aaa)
        return dump_image(ctx, q, aaa[0]), dump_image(ctx, q, aaa[6]), dump_image(ctx, q, aaa[10])
      """

      """
      if isinstance(aaa[0], cl.Image):
        dump_image(ctx, q, aaa[0])
        if k['name'] == "convolution_horizontal_reduced_reads":
          dump_image(ctx, q, aaa[6])
      """

      #q.finish()

    q.finish()
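    # The queue was created with PROFILING_ENABLE, so each event reports start/end
    # timestamps in nanoseconds: /1e3 below is microseconds per kernel, /1e6 milliseconds total.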
    for k,e in kernels:
      print("%-60s" % k['name'], f"{str(k['global_work_size']):20s} {str(k['local_work_size']):20s} {(e.profile.end - e.profile.start)/1e3:9.2f} us")
      total_runtime += e.profile.end - e.profile.start
    print(f"total runtime: {total_runtime/1e6:.2f} ms")

    if len(outputs) == 0: return
    if vision_only:
      output = vision[0]
      ret = np.zeros(output.size//2, dtype=np.float16)
    else:
      output = outputs[0]
      ret = np.zeros(output.size//4, dtype=np.float32)
    cl.enqueue_copy(q, ret, output)
    if float32:
      # in float32 mode the buffers above were allocated at twice the size, so only the first half is real output
      return ret[:len(ret)//2]
    else:
      return ret

  return runner

if __name__ == "__main__":
  runner = load_thneed_model("/data/openpilot/selfdrive/modeld/models/supercombo.thneed" if len(sys.argv) == 1 else sys.argv[1], float32=bool(int(os.getenv("FLOAT32", "0"))))

  np.random.seed(1338)
  np_inputs = {
    "input_imgs": np.random.randn(*(1, 12, 128, 256))*256,
    "big_input_imgs": np.random.randn(*(1, 12, 128, 256))*256,
    "desire": np.zeros((1, 100, 8)),
    "traffic_convention": np.array([[1., 0.]]),
    "features_buffer": np.random.randn(*(1, 99, 128))
  }
  np_inputs = {k:v.astype(np.float32) for k,v in np_inputs.items()}
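  # Note: the input list is reversed below, presumably because the thneed input buffers were
  # discovered in the opposite order to the dict above (the ONNX check further down only
  # validates np_inputs against the ONNX graph order, not against the thneed ordering).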
  inputs = list(np_inputs.values())[::-1]

  ret = runner(inputs, vision_only=False, debug=True)
  print(ret.shape)
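
  # Optional check: when a second argument (an ONNX model location passed to fetch) is given,
  # the same inputs are run through the ONNX model with a torch backend and compared
  # element-wise; an element only counts as a difference if it is off by more than
  # 0.1 absolute and 1% relative.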
  if len(sys.argv) > 2:
    print("comparing to ONNX")
    from test.test_onnx import run_onnx_torch
    from extra.utils import fetch
    import onnx, io
    dat = fetch(sys.argv[2])
    onnx_model = onnx.load(io.BytesIO(dat))
    for inp, numpy_name in zip(onnx_model.graph.input, np_inputs.keys()):
      assert inp.name == numpy_name, f"name mismatch {inp.name} {numpy_name}"
    out = run_onnx_torch(onnx_model, np_inputs).numpy()[0]

    diff = 0
    diffs = []
    for i in range(ret.shape[0]):
      if abs(out[i]-ret[i]) > 0.1 and abs((out[i]-ret[i])/out[i]) > 0.01:
        diff += 1
        diffs.append(out[i] - ret[i])
        if diff == 10:
          print("...")
        elif diff < 10:
          print(i, out[i], ret[i], out[i]-ret[i])
    if len(diffs) > 0:
      print("%d differences min: %f max: %f" % (diff, min(diffs), max(diffs)))
    assert diff == 0

    """
    for i in range(0, len(ret), 0x10):
      p = []
      for j in ret[i:i+0x10]:
        p.append("%6.2f " % j)
      print("%5d" % i + ''.join(p))
    """
    exit(0)

  #test_dat = [open("/home/batman/openpilot/xx/tools/snpe/compile_test_data/dlc_input_%d" % i, "rb").read() for i in range(4)]
  #cl.enqueue_copy(q, inputs[3], test_dat[0])

  for i in range(5):
    st = time.time()
    ret = runner()
    et = time.time()
    print(ret.shape, ret, (et-st)*1000.)
  exit(0)
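
  # Everything below is unreachable after the exit(0) above: leftover debug code for
  # reading the raw output image/buffer directly.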
  print([x.size for x in inputs])
  print("**************", outputs)
  output = outputs[0]

  #print(dir(output))
  #print(output.buffer)

  ret = np.zeros(output.size//4, dtype=np.float32)
  if output.type == cl.mem_object_type.IMAGE2D:
    cl.enqueue_copy(q, ret, output, origin=(0,0), region=output.shape)
  else:
    cl.enqueue_copy(q, ret, output)
  #cl.enqueue_copy(q, ret, output.buffer)
  #for i in range(0, 32, 16):
  #  print(ret[i:i+0x10])
  print(ret.shape, ret)