tinygrad/openpilot/run_thneed.py

#!/usr/bin/env python3
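"""
Run a thneed model with pyopencl.

thneed is openpilot's serialized OpenCL model format: a JSON description of the
programs, objects (buffers/images), and kernel calls, followed by the raw
weights. This script rebuilds those CL objects, replays the kernels with
per-kernel profiling, and can optionally compare the output against an ONNX
model passed as the second argument (see __main__ below).
"""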
import os
import sys
import time
import struct
import numpy as np
BASEDIR = os.path.dirname(os.path.abspath(__file__))+"/"
THNEED_KERNELS = "../../selfdrive/modeld/thneed/kernels/"

def load_thneed_model(fn="model.thneed", float32=False, replace=None):
  import pyopencl as cl

  # use the first GPU, or fall back to CPU if none is available
  devices = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
  if len(devices) == 0: # settle for CPU
    devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], [])
  ctx = cl.Context(devices=devices[0:1])
  q = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

  mf = cl.mem_flags
  image_fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT if float32 else cl.channel_type.HALF_FLOAT)
  image_fmt_32 = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)

  import json
  import traceback

  # thneed file layout: 4-byte JSON length, JSON metadata, then raw weight bytes
  with open(fn if (fn[0] == "/") else (BASEDIR+"../models/"+fn), "rb") as f:
    json_len = struct.unpack("I", f.read(4))[0]
    jdat = json.loads(f.read(json_len).decode('latin_1'))
    weights = f.read()

  # jdat = ['kernels', 'objects', 'programs']
  prgs = {}
  for k,v in jdat['programs'].items():
    print("building", k)
    try:
      prgs[k] = cl.Program(ctx, v).build().__getattr__(k)
    except Exception:
      print("FAILED", k)
      traceback.print_exc()
      exit(0)

  # create a CL buffer or image for each object, loading weights where needed
  bufs = {'\x00\x00\x00\x00\x00\x00\x00\x00': None}
  bufs_loaded = {}
  ptr = 0
  for o in jdat['objects']:
    #print(o)
    if o['needs_load']:
      nptr = ptr + o['size']
      o['data'] = weights[ptr:nptr]
      ptr = nptr

    if o['arg_type'] == "image2d_t" or o['arg_type'] == "image1d_t":
      tfmt = image_fmt_32 if 'float32' in o and o['float32'] else image_fmt
      if o['arg_type'] == "image2d_t":
        if 'buffer_id' in o and o['height'] == 1 and not bufs_loaded[o['buffer_id']]:
          # hack: use an image1d since we can back that with a buffer
          buf = cl.Image(ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']])
        else:
          # buffer isn't supported in image2d, copy buffer into image
          if 'buffer_id' in o and bufs_loaded[o['buffer_id']]:
            arr = np.zeros(bufs[o['buffer_id']].size // 2, dtype=np.float16)
            cl.enqueue_copy(q, arr, bufs[o['buffer_id']])
            buf = cl.Image(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt,
                           shape=(o['width'], o['height']), pitches=(o['row_pitch'],), hostbuf=arr)
          elif o['needs_load']:
            buf = cl.Image(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt,
                           shape=(o['width'], o['height']), pitches=(o['row_pitch'],), hostbuf=o['data'])
          else:
            buf = cl.Image(ctx, mf.READ_WRITE, tfmt, shape=(o['width'], o['height']))
      if o['arg_type'] == "image1d_t":
        assert not o['needs_load']
        assert not bufs_loaded[o['buffer_id']]
        buf = cl.Image(ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']])
    else:
      if 'data' in o:
        buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=o['data'])
      else:
        # zero out buffers
        buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size']*(2 if float32 else 1))

    bufs[o['id']] = buf
    bufs_loaded[o['id']] = 'data' in o

  # load binaries
  for o in jdat['binaries']:
    nptr = ptr + o['length']
    prgs[o['name']] = cl.Program(ctx, ctx.devices, [weights[ptr:nptr]]).build().__getattr__(o['name'])
    ptr = nptr

  # find the model's inputs and outputs
  inputs, vnum, vision, outputs = [], [], [], []
  for k in jdat['inputs'] if 'inputs' in jdat else []:
    print(f"new style input {k['name']} with size {k['size']}")
    inputs.append(bufs[k['buffer_id']])
  for k in jdat['outputs'] if 'outputs' in jdat else []:
    outputs.append(bufs[k['buffer_id']])

  # old style inputs
  for i,k in enumerate(jdat['kernels']):
    if k['name'] == 'zero_pad_image_float':
      inputs.append(bufs[k['args'][1]])
    # vision model
    if k['name'] == 'zero_pad_image_half':
      vision.append(bufs[k['args'][1]])
      vnum.append(i)
    if k['name'] == 'image2d_to_buffer_float':
      outputs.append(bufs[k['args'][2]])

    # resolve kernel argument names (used for debugging in runner)
    k['args_name'] = []
    prg = prgs[k['name']]
    for i,arg in enumerate(k['args']):
      try:
        k['args_name'].append(prg.get_arg_info(i, cl.kernel_arg_info.NAME))
      except cl.RuntimeError:
        k['args_name'].append("<UNKNOWN>")
  vision = vision[0:1]
  vnum = vnum[0] if len(vnum) >= 1 else None

  def runner(inp=[], policy_only=False, vision_only=False, debug=False):
    kernels = []
    total_runtime = 0

    # copy the host inputs into the model's CL input buffers
    # [2048, 8, 32, 1572864]
    real_inputs = inputs[0:3]+vision if policy_only else inputs
    for a,b in zip(real_inputs, inp):
      if debug:
        print(a.size, b.size*b.itemsize)
      #assert a.size == (b.size * b.itemsize) or float32
      cl.enqueue_copy(q, a, np.array(b, dtype=np.float16 if len(vision) > 0 and a == vision[0] else np.float32))

    # replay the recorded kernels
    #jdat['kernels'] = jdat['kernels'][0:8]
    seen_output = set(real_inputs)
    for k in jdat['kernels'][0:3]+jdat['kernels'][vnum:] if policy_only else (jdat['kernels'][:vnum] if vision_only else jdat['kernels']):
      kernel = prgs[k['name']]
      aaa = []
      has_local = False
      for i,(a,sz) in enumerate(zip(k['args'], k['args_size'])):
        arg_name = k['args_name'][i]
        if len(a) == 0:
          # empty arg means local memory of the given size
          aa = cl.LocalMemory(sz)
          has_local = True
        elif len(a) == 4:
          a = a.encode('latin_1')
          aa = np.uint32(struct.unpack("I", a)[0])
        elif len(a) == 2:
          a = a.encode('latin_1')
          aa = np.uint16(struct.unpack("H", a)[0])
        elif len(a) == 8:
          # 8-byte args are buffer/image ids
          aa = bufs[a]
          if debug:
            #print(f"  {arg_name:20s} : {aa}")
            if arg_name == "output":
              seen_output.add(aa)
            if arg_name == "input":
              if aa not in seen_output:
                print("ERROR", aa, "is not seen in output")
        aaa.append(aa)
      """
      for a in aaa:
        types = {cl.mem_object_type.IMAGE1D_BUFFER: "IMAGE1D_BUFFER", cl.mem_object_type.IMAGE2D: "IMAGE2D", cl.mem_object_type.IMAGE1D: "IMAGE1D"}
        if isinstance(a, cl.Image):
          if a.type == cl.mem_object_type.IMAGE2D:
            print(" ", a, types[a.type], a.shape)
          elif a.type == cl.mem_object_type.IMAGE1D_BUFFER:
            print(" ", a, types[a.type], a.size)
          else:
            print(" ", a, types[a.type])
        elif isinstance(a, cl.Buffer):
          print(" ", a, a.size)
        elif isinstance(a, cl.LocalMemory):
          print(" ", a, a.size)
        else:
          print(" ", a)
      """
      if has_local:
        e = kernel(q, k['global_work_size'], k['local_work_size'], *aaa)
      else:
        e = kernel(q, k['global_work_size'], None, *aaa)
      kernels.append((k,e))
      #if k['name'] == 'zero_pad_image_float':
      #  arr = np.zeros((aaa[1].size//4), dtype=np.float32)
      #  cl.enqueue_copy(q, arr, aaa[1])
      """
      if k['name'] == "convolution_horizontal_reduced_reads":
        print(aaa)
        return dump_image(ctx, q, aaa[0]), dump_image(ctx, q, aaa[6]), dump_image(ctx, q, aaa[10])
      """
      """
      if isinstance(aaa[0], cl.Image):
        dump_image(ctx, q, aaa[0])
        if k['name'] == "convolution_horizontal_reduced_reads":
          dump_image(ctx, q, aaa[6])
      """
      #q.finish()

    # wait for everything, then print per-kernel profiling
    q.finish()
    for k,e in kernels:
      print("%-60s" % k['name'], f"{str(k['global_work_size']):20s} {str(k['local_work_size']):20s} {(e.profile.end - e.profile.start)/1e3:9.2f} us")
      total_runtime += e.profile.end - e.profile.start
    print(f"total runtime: {total_runtime/1e6:.2f} ms")
    if len(outputs) == 0: return

    # copy the (first) output back to the host
    if vision_only:
      output = vision[0]
      ret = np.zeros(output.size//2, dtype=np.float16)
    else:
      output = outputs[0]
      ret = np.zeros(output.size//4, dtype=np.float32)
    cl.enqueue_copy(q, ret, output)
    if float32:
      return ret[:len(ret)//2]
    else:
      return ret
  return runner
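
# usage: run_thneed.py [model.thneed] [model.onnx]
#   with no arguments it runs /data/openpilot/selfdrive/modeld/models/supercombo.thneed
#   if a second argument is given, the output is compared against that ONNX model
#   set FLOAT32=1 to treat the model's buffers as float32 instead of half floats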
if __name__ == "__main__":
  runner = load_thneed_model("/data/openpilot/selfdrive/modeld/models/supercombo.thneed" if len(sys.argv) == 1 else sys.argv[1], float32=bool(int(os.getenv("FLOAT32", "0"))))

  np.random.seed(1338)
  np_inputs = {
    "input_imgs": np.random.randn(*(1, 12, 128, 256))*256,
    "big_input_imgs": np.random.randn(*(1, 12, 128, 256))*256,
    "desire": np.zeros((1, 100, 8)),
    "traffic_convention": np.array([[1., 0.]]),
    "features_buffer": np.random.randn(*(1, 99, 128))
  }
  np_inputs = {k:v.astype(np.float32) for k,v in np_inputs.items()}
  inputs = list(np_inputs.values())[::-1]
  ret = runner(inputs, vision_only=False, debug=True)
  print(ret.shape)

  if len(sys.argv) > 2:
    print("comparing to ONNX")
    from test.test_onnx import run_onnx_torch
    from extra.utils import fetch
    import onnx, io
    dat = fetch(sys.argv[2])
    onnx_model = onnx.load(io.BytesIO(dat))
    for inp, numpy_name in zip(onnx_model.graph.input, np_inputs.keys()):
      assert inp.name == numpy_name, f"name mismatch {inp.name} {numpy_name}"
    out = run_onnx_torch(onnx_model, np_inputs).numpy()[0]

    # count outputs that differ by more than 0.1 absolute and 1% relative
    diff = 0
    diffs = []
    for i in range(ret.shape[0]):
      if abs(out[i]-ret[i]) > 0.1 and abs((out[i]-ret[i])/out[i]) > 0.01:
        diff += 1
        diffs.append(out[i] - ret[i])
        if diff == 10:
          print("...")
        elif diff < 10:
          print(i, out[i], ret[i], out[i]-ret[i])
    if len(diffs) > 0:
      print("%d differences min: %f max: %f" % (diff, min(diffs), max(diffs)))
    assert diff == 0
"""
for i in range(0, len(ret), 0x10):
p = []
for j in ret[i:i+0x10]:
p.append("%6.2f " % j)
print("%5d" % i + ''.join(p))
"""
exit(0)
#test_dat = [open("/home/batman/openpilot/xx/tools/snpe/compile_test_data/dlc_input_%d" % i, "rb").read() for i in range(4)]
#cl.enqueue_copy(q, inputs[3], test_dat[0])
for i in range(5):
st = time.time()
ret = runner()
et = time.time()
print(ret.shape, ret, (et-st)*1000.)
exit(0)
print([x.size for x in inputs])
print("**************", outputs)
output = outputs[0]
#print(dir(output))
#print(output.buffer)
ret = np.zeros(output.size//4, dtype=np.float32)
if output.type == cl.mem_object_type.IMAGE2D:
cl.enqueue_copy(q, ret, output, origin=(0,0), region=output.shape)
else:
cl.enqueue_copy(q, ret, output)
#cl.enqueue_copy(q, ret, output.buffer)
#for i in range(0, 32, 16):
# print(ret[i:i+0x10])
print(ret.shape, ret)