From f83d715f41f964906bd30ef4855cb11e6a06733e Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 10 Dec 2024 06:21:42 +0800 Subject: [PATCH] move checks into compile3, delete compile2 [pr] (#8127) * move checks into compile3 [pr] * test_vs_onnx * test v torch works * float16 won't compile on compile3 * actually delete compile2 --- .github/workflows/benchmark.yml | 4 - .github/workflows/test.yml | 15 +-- examples/openpilot/compile2.py | 211 -------------------------------- examples/openpilot/compile3.py | 65 +++++++--- examples/openpilot/go.sh | 2 - 5 files changed, 55 insertions(+), 242 deletions(-) delete mode 100644 examples/openpilot/compile2.py delete mode 100755 examples/openpilot/go.sh diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a766eef1ca..442444a57f 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -508,10 +508,6 @@ jobs: rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal - name: reset process replay run: test/external/process_replay/reset.py - - name: openpilot compile 0.9.4 - run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python examples/openpilot/compile2.py | tee openpilot_compile_0_9_4.txt - - name: openpilot compile 0.9.7 - run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python examples/openpilot/compile2.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_compile_0_9_7.txt - name: validate openpilot 0.9.7 run: PYTHONPATH=. FLOAT16=0 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt - name: benchmark openpilot 0.9.4 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3ef10566a1..f17b70ce8f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -293,22 +293,15 @@ jobs: PYTHONPATH="." GPU=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20 PYTHONPATH="." GPU=1 IMAGE=2 python3 test/models/test_end2end.py TestEnd2End.test_linear_mnist - if: ${{ matrix.task == 'optimage' }} - name: Test openpilot model compile and size + name: Test openpilot model kernel count and gate usage run: | - PYTHONPATH="." DEBUG=2 ALLOWED_KERNEL_COUNT=208 ALLOWED_GATED_READ_IMAGE=13 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile2.py - python -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000' - - if: ${{ matrix.task == 'optimage' }} - name: Test openpilot model correctness (float32) - run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile2.py - - if: ${{ matrix.task == 'optimage' }} - name: Test openpilot compile3 - run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py + PYTHONPATH="." ALLOWED_KERNEL_COUNT=208 ALLOWED_GATED_READ_IMAGE=13 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'optimage' }} name: Test openpilot alt model correctness (float32) - run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile2.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx + run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'optimage' }} name: Test openpilot fastvits model correctness (float32) - run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile2.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx + run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'onnx' }} name: Test ONNX (GPU) run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 diff --git a/examples/openpilot/compile2.py b/examples/openpilot/compile2.py deleted file mode 100644 index d862bf203c..0000000000 --- a/examples/openpilot/compile2.py +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env python3 -import os, sys, io, pathlib, json, struct -import numpy as np -sys.path.insert(0, str(pathlib.Path(__file__).parents[1])) - -if "FLOAT16" not in os.environ: os.environ["FLOAT16"] = "1" -if "IMAGE" not in os.environ: os.environ["IMAGE"] = "2" -if "NOLOCALS" not in os.environ: os.environ["NOLOCALS"] = "1" - -OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx" - -import onnx -from typing import Tuple, List, Optional, Dict, cast -from extra.onnx import get_run_onnx -from tinygrad import Tensor, Device, GlobalCounters, dtypes -from tinygrad.dtype import ImageDType -from tinygrad.device import Buffer -from tinygrad.helpers import partition, Context, fetch, getenv, DEBUG, tqdm -from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem, CompiledRunner -from tinygrad.engine.memory import memory_planner -from tinygrad.engine.schedule import ScheduleItem, create_schedule -from tinygrad.ops import Ops -from tinygrad.tensor import _to_np_dtype -Device.DEFAULT = "GPU" - -def get_schedule(onnx_data) -> Tuple[List[ScheduleItem], List[ScheduleItem]]: - Tensor.no_grad = True - Tensor.training = False - - # load the model - onnx_model = onnx.load(io.BytesIO(onnx_data)) - run_onnx = get_run_onnx(onnx_model) - input_shapes = {inp.name:tuple(x.dim_value for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input} - - # run the model - inputs = {k:Tensor.empty(*shp) for k,shp in input_shapes.items()} - ret: Tensor = next(iter(run_onnx(inputs).values())).cast(dtypes.float32).contiguous() - schedule = create_schedule([ret.lazydata]) - - # filter schedule that don't depend on the inputs - input_lb = [x.lazydata.base.buffer for x in inputs.values()] - depends = set(input_lb) - for si in schedule: - if any(b in depends for b in si.inputs): - for out in si.outputs: depends.add(out) - - # run all kernels that don't depend on the inputs - # NOTE: there's two extra kernels due to fusions that now happen since the weights aren't realized - schedule, schedule_independent = partition(schedule, lambda si: any(out in depends for out in si.outputs)) - print(f"{len(schedule)} schedule items depend on the input, {len(schedule_independent)} don't") - - # confirm no non-sink metaop in the (non independent) schedule except for the ones that load the input buffers - assert all(si.ast.op is Ops.SINK or out in input_lb for si in schedule for out in si.outputs), "has non SINK ops, can't compile to Thneed" - return schedule, schedule_independent, inputs - -def test_vs_onnx(onnx_data, eis:Optional[List[ExecItem]], inputs:Dict[str, Tensor]): - import onnx - #import pyopencl as cl - #from extra.thneed import Thneed - import numpy as np - onnx_model = onnx.load(io.BytesIO(onnx_data)) - - input_shapes = {inp.name:tuple(x.dim_value for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input} - Tensor.manual_seed(1337) - new_inputs = {k:Tensor.randn(*shp, requires_grad=False)*8 for k,shp in input_shapes.items()} - new_np_inputs = {k:v.realize().numpy() for k,v in new_inputs.items()} - - if getenv("ORT"): - # test with onnxruntime - import onnxruntime as ort - onnx_session = ort.InferenceSession(onnx_data) - onnx_output = onnx_session.run([onnx_model.graph.output[0].name], {k:v.astype(np.float16) for k,v in new_np_inputs.items()}) - new_torch_out = onnx_output[0] - print("got ort outputs") - else: - # test with torch - from test.models.test_onnx import run_onnx_torch - new_torch_out = run_onnx_torch(onnx_model, new_np_inputs).numpy() - print("got torch outputs") - - # if you don't have a schedule - if eis is None: - run_onnx = get_run_onnx(onnx_model) - new_tinygrad_out = next(iter(run_onnx(new_inputs).values())).cast(dtypes.float32).numpy() - np.testing.assert_allclose(new_torch_out, new_tinygrad_out, atol=1e-4, rtol=1e-2) - print("classic self-test passed!") - return - - # set inputs - for k,v in inputs.items(): v.lazydata.base.realized.copyin(new_np_inputs[k].data) - - # run code (all buffers have been allocated) - GlobalCounters.reset() - output = eis[-1].bufs[0] - for ei in eis: ei.run() - - new_tinygrad_out = np.frombuffer(output.as_buffer(), dtype=_to_np_dtype(output.dtype)) - np.testing.assert_allclose(new_torch_out.reshape(new_tinygrad_out.shape), new_tinygrad_out, atol=1e-4, rtol=1e-2) - print("semi-thneed self-test passed!") - -if __name__ == "__main__": - onnx_data = fetch(sys.argv[1] if len(sys.argv) > 1 else OPENPILOT_MODEL).read_bytes() - - # quick test for ONNX issues - #thneed_test_onnx(onnx_data, None) - #exit(0) - - schedule, schedule_independent, inputs = get_schedule(onnx_data) - schedule, schedule_input = partition(schedule, lambda x: x.ast.op is Ops.SINK) - print(f"{len(schedule_input)} inputs") - - run_schedule(schedule_independent) - run_schedule(schedule_input) - with Context(DEBUG=max(DEBUG.value, 2), BEAM=getenv("LATEBEAM")): - schedule = memory_planner(schedule) - for si in schedule: - for b in si.outputs: - assert not b.is_allocated(), "output should not be allocated" - image_count = sum(isinstance(out.dtype, ImageDType) for si in schedule for out in si.outputs) - print(f"**** compiling real kernels {image_count}/{len(schedule)} images ****") - eis = list(tqdm(lower_schedule(schedule), total=len(schedule))) - - print("kernel count:", len(eis)) - assert len(eis) <= getenv("ALLOWED_KERNEL_COUNT", 0) or getenv("ALLOWED_KERNEL_COUNT", 0) == 0, "too many kernels!" - - # new simple thneed - def to_ref(b:Buffer): return struct.pack("Q", id(b)).decode("latin_1") - - seen_buffers = set() - input_buffers = [x.lazydata.buffer for x in inputs.values()] - jdat = {"binaries": [], "programs": {}, "kernels": [], "objects": []} - jdat["inputs"] = {k:to_ref(v.lazydata.buffer) for k,v in inputs.items()} - jdat["outputs"] = [to_ref(eis[-1].bufs[0])] - weights = [] - for i,ei in enumerate(eis): - #print("***", i) - for b in ei.bufs: - needs_load = b.is_allocated() and b not in input_buffers - #print(b, needs_load) - if b in seen_buffers: continue - seen_buffers.add(b) - if isinstance(b.dtype, ImageDType): - base_dtype = dtypes.float16 if b.dtype.fmt == 'e' else dtypes.float32 - row_pitch = (b.dtype.shape[0]*4*base_dtype.itemsize + 63)//64 * 64 - size = row_pitch * b.dtype.shape[1] - jdat['objects'].append({ - "id": to_ref(b), "needs_load": needs_load, "size": size, "arg_type": "image2d_t", - "width": b.dtype.shape[0], "height": b.dtype.shape[1], "row_pitch": row_pitch, "float32": b.dtype.base == dtypes.float32, - }) - if needs_load: - t = Tensor.empty(b.dtype.shape, dtype=b.dtype) - t.lazydata.buffer = b - data = t.cast(dtypes.float32).pad(((0, row_pitch//(4*base_dtype.itemsize)-b.dtype.shape[0]), (0,0), (0,0))).contiguous().numpy() - # NOTE: this cast must be done in numpy for platforms that don't support half - if base_dtype == dtypes.float16: data = data.astype(np.float16) - weights.append(data.tobytes()) - assert len(weights[-1]) == size, "wrong size buffer" - else: - jdat['objects'].append({ - "id": to_ref(b), "arg_type": b.dtype.name + "*", "needs_load": needs_load, "size": b.nbytes, - }) - if needs_load: - weights.append(b.as_buffer()) - assert len(weights[-1]) == b.nbytes, "wrong size buffer" - - saved_binaries = set() - binaries = [] - gated_read_image_count = 0 - GlobalCounters.reset() - with Context(DEBUG=max(DEBUG.value, 2)): - for ei in eis: - prg = cast(CompiledRunner, ei.prg) - assert len(prg.p.vars) == 0 - if prg.p.function_name not in saved_binaries: - jdat['binaries'].append({"name":prg.p.function_name, "length":len(prg.lib)}) - binaries.append(prg.lib) - saved_binaries.add(prg.p.function_name) - gated_read_image_count += prg.p.src.count("?read_image") - ei.run() - jdat['kernels'].append({ - "name": prg.p.function_name, - "work_dim": len(prg.p.global_size), - "global_work_size": prg.p.global_size, - "local_work_size": prg.p.local_size, - "num_args": len(ei.bufs), - "args": [to_ref(b) for b in ei.bufs], - "arg_size": [8]*len(ei.bufs), - }) - - if (allowed_gated_read_image:=getenv("ALLOWED_GATED_READ_IMAGE", -1)) != -1: - assert gated_read_image_count <= allowed_gated_read_image, \ - f"too many gated read_image! {gated_read_image_count=}, {allowed_gated_read_image=}" - - output_fn = sys.argv[2] if len(sys.argv) >= 3 else "/tmp/output.thneed" - print(f"saving thneed to {output_fn} with {len(weights)} buffers and {len(binaries)} binaries") - with open(output_fn, "wb") as f: - j = json.dumps(jdat, ensure_ascii=False).encode('latin_1') - f.write(struct.pack("I", len(j))) - f.write(j) - for w in weights: f.write(w) - for b in binaries: f.write(b) - print("saved", f.tell(), "bytes") - - FLOAT16 = getenv("FLOAT16", 0) - if FLOAT16 == 0: - try: - test_vs_onnx(onnx_data, eis, inputs) - except ModuleNotFoundError as e: - print(f"TEST NOT HAPPENING {e}") - - diff --git a/examples/openpilot/compile3.py b/examples/openpilot/compile3.py index cc338bfbd5..47ca0a41b2 100644 --- a/examples/openpilot/compile3.py +++ b/examples/openpilot/compile3.py @@ -8,6 +8,7 @@ if "JIT_BATCH_SIZE" not in os.environ: os.environ["JIT_BATCH_SIZE"] = "0" from tinygrad import fetch, Tensor, TinyJit, Context, GlobalCounters, Device from tinygrad.helpers import DEBUG, getenv from tinygrad.tensor import _from_np_dtype +from tinygrad.engine.realize import CompiledRunner import onnx from onnx.helper import tensor_dtype_to_np_dtype @@ -16,12 +17,11 @@ from extra.onnx import get_run_onnx # TODO: port to main tinygrad OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx" OUTPUT = "/tmp/openpilot.pkl" -def compile(): +def compile(onnx_file): + onnx_model = onnx.load(onnx_file) Tensor.no_grad = True Tensor.training = False - onnx_bytes = fetch(OPENPILOT_MODEL) - onnx_model = onnx.load(onnx_bytes) run_onnx = get_run_onnx(onnx_model) print("loaded model") @@ -48,23 +48,29 @@ def compile(): np.testing.assert_equal(test_val, ret, "JIT run failed") print("jit run validated") + # checks from compile2 + kernel_count = 0 + gated_read_image_count = 0 + for ei in run_onnx_jit.captured.jit_cache: + if isinstance(ei.prg, CompiledRunner): + kernel_count += 1 + gated_read_image_count += ei.prg.p.src.count("?read_image") + print(f"kernel_count: {kernel_count} gated_read_image_count: {gated_read_image_count}") + assert kernel_count <= getenv("ALLOWED_KERNEL_COUNT", 0) or getenv("ALLOWED_KERNEL_COUNT", 0) == 0, "too many kernels!" + if (allowed_gated_read_image:=getenv("ALLOWED_GATED_READ_IMAGE", -1)) != -1: + assert gated_read_image_count <= allowed_gated_read_image, \ + f"too many gated read_image! {gated_read_image_count=}, {allowed_gated_read_image=}" + with open(OUTPUT, "wb") as f: pickle.dump(run_onnx_jit, f) - mdl_sz = os.path.getsize(onnx_bytes) + mdl_sz = os.path.getsize(onnx_file) pkl_sz = os.path.getsize(OUTPUT) print(f"mdl size is {mdl_sz/1e6:.2f}M") print(f"pkl size is {pkl_sz/1e6:.2f}M") print("**** compile done ****") return test_val -def test(test_val=None): - with open(OUTPUT, "rb") as f: - run = pickle.load(f) - - # same randomness as above - Tensor.manual_seed(100) - new_inputs = {nm:Tensor.randn(*st.shape, dtype=dtype).mul(8).realize() for nm, (st, _, dtype, _) in - sorted(zip(run.captured.expected_names, run.captured.expected_st_vars_dtype_device))} +def test_vs_compile(run, new_inputs, test_val=None): new_inputs_numpy = {k:v.numpy() for k,v in new_inputs.items()} # create fake "from_blob" tensors for the inputs, and wrapped NPY tensors for the numpy inputs (these have the same underlying memory) @@ -88,8 +94,39 @@ def test(test_val=None): out = run(**inputs) changed_val = out.numpy() np.testing.assert_raises(AssertionError, np.testing.assert_array_equal, val, changed_val) + return val + +def test_vs_onnx(new_inputs, test_val, onnx_file): + new_inputs_numpy = {k:v.numpy() for k,v in new_inputs.items()} + onnx_model = onnx.load(onnx_file) + + if getenv("ORT"): + # test with onnxruntime + import onnxruntime as ort + onnx_session = ort.InferenceSession(onnx_file) + onnx_output = onnx_session.run([onnx_model.graph.output[0].name], {k:v.astype(np.float16) for k,v in new_inputs_numpy.items()}) + new_torch_out = onnx_output[0] + print("got ort outputs") + else: + # test with torch + from test.models.test_onnx import run_onnx_torch + # NOTE: we have to correct the order here + new_torch_out = run_onnx_torch(onnx_model, {k.name:new_inputs_numpy[k.name] for k in onnx_model.graph.input}).numpy() + print("got torch outputs") + + np.testing.assert_allclose(new_torch_out.reshape(test_val.shape), test_val, atol=1e-4, rtol=1e-2) + print("test vs onnx passed") if __name__ == "__main__": - test_val = compile() if not getenv("RUN") else None - test(test_val) + onnx_file = fetch(OPENPILOT_MODEL) + test_val = compile(onnx_file) if not getenv("RUN") else None + with open(OUTPUT, "rb") as f: pickle_loaded = pickle.load(f) + + # same randomness as compile + Tensor.manual_seed(100) + new_inputs = {nm:Tensor.randn(*st.shape, dtype=dtype).mul(8).realize() for nm, (st, _, dtype, _) in + sorted(zip(pickle_loaded.captured.expected_names, pickle_loaded.captured.expected_st_vars_dtype_device))} + + test_val = test_vs_compile(pickle_loaded, new_inputs, test_val) + if not getenv("FLOAT16"): test_vs_onnx(new_inputs, test_val, onnx_file) diff --git a/examples/openpilot/go.sh b/examples/openpilot/go.sh deleted file mode 100755 index dbd17a5e96..0000000000 --- a/examples/openpilot/go.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -NOLOCALS=1 FLOAT16=1 DEBUGCL=1 IMAGE=2 GPU=1 python3 examples/openpilot/compile2.py