diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2cb00af76c..442444a57f 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -197,11 +197,11 @@ jobs: # - name: Run LLaMA 7B on 6 GPUs # run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt - name: Run LLaMA-3 8B BEAM - run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt + run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt - name: Run LLaMA-3 8B on 4 GPUs - run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt + run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt - name: Run LLaMA-3 8B on 6 GPUs - run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt + run: NV=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt - name: Run LLaMA-2 70B run: NV=1 RUN_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt - name: Run Mixtral 8x7B @@ -380,11 +380,11 @@ jobs: # - name: Run LLaMA 7B on 6 GPUs # run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt - name: Run LLaMA-3 8B BEAM - run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt + run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt - name: Run LLaMA-3 8B on 4 GPUs - run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt + run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt - name: Run LLaMA-3 8B on 6 GPUs - run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt + run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt - name: Run LLaMA-2 70B run: AMD=1 RUN_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt - name: Run Mixtral 8x7B @@ -508,10 +508,6 @@ jobs: rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal - name: reset process replay run: test/external/process_replay/reset.py - - name: openpilot compile 0.9.4 - run: PYTHONPATH=. 
NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python examples/openpilot/compile2.py | tee openpilot_compile_0_9_4.txt - - name: openpilot compile 0.9.7 - run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python examples/openpilot/compile2.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_compile_0_9_7.txt - name: validate openpilot 0.9.7 run: PYTHONPATH=. FLOAT16=0 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt - name: benchmark openpilot 0.9.4 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c2c553c9af..f17b70ce8f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,7 +1,7 @@ name: Unit Tests env: # increment this when downloads substantially change to avoid the internet - DOWNLOAD_CACHE_VERSION: '7' + DOWNLOAD_CACHE_VERSION: '8' RUN_PROCESS_REPLAY: 1 GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} PYTHONPATH: . @@ -293,22 +293,15 @@ jobs: PYTHONPATH="." GPU=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20 PYTHONPATH="." GPU=1 IMAGE=2 python3 test/models/test_end2end.py TestEnd2End.test_linear_mnist - if: ${{ matrix.task == 'optimage' }} - name: Test openpilot model compile and size + name: Test openpilot model kernel count and gate usage run: | - PYTHONPATH="." DEBUG=2 ALLOWED_KERNEL_COUNT=208 ALLOWED_GATED_READ_IMAGE=13 FLOAT16=1 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile2.py - python -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000' - - if: ${{ matrix.task == 'optimage' }} - name: Test openpilot model correctness (float32) - run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile2.py - - if: ${{ matrix.task == 'optimage' }} - name: Test openpilot compile3 - run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py + PYTHONPATH="." ALLOWED_KERNEL_COUNT=208 ALLOWED_GATED_READ_IMAGE=13 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'optimage' }} name: Test openpilot alt model correctness (float32) - run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile2.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx + run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'optimage' }} name: Test openpilot fastvits model correctness (float32) - run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile2.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx + run: PYTHONPATH="." 
FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx - if: ${{ matrix.task == 'onnx' }} name: Test ONNX (GPU) run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 @@ -387,7 +380,7 @@ jobs: WEBGPU=1 WGPU_BACKEND_TYPE=Vulkan python3 -m pytest -n=auto test/test_assign.py test/test_arange.py test/test_const_folding.py test/test_dtype.py \ test/test_dtype_alu.py test/test_conv.py test/test_conv_shapetracker.py test/test_nn.py test/test_ops.py test/test_optim.py \ test/test_jit.py test/test_randomness.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_uops_stats.py test/test_uops.py \ - --durations=20 + test/testextra/test_export_model.py test/testextra/test_f16_decompress.py --durations=20 - name: Run process replay tests run: | export PR_TITLE=$(jq -r .pull_request.title "$GITHUB_EVENT_PATH") @@ -439,7 +432,7 @@ jobs: - name: Test Beam Search run: PYTHONPATH="." METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py - name: Fuzz Test linearizer - run: PYTHONPATH="." METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=2 FUZZ_N=24 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py + run: PYTHONPATH="." METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py # - name: Fuzz Test models schedule # run: FUZZ_SCHEDULE=1 FUZZ_SCHEDULE_MAX_PATHS=5 python -m pytest test/models/test_train.py test/models/test_end2end.py - name: Run TRANSCENDENTAL math @@ -528,7 +521,7 @@ jobs: if: matrix.backend == 'ptx' || matrix.backend == 'triton' || matrix.backend == 'nv' run: | cd ${{ github.workspace }}/gpuocelot/ocelot/build - sudo ninja install -d explain + sudo cp libgpuocelot.so /usr/lib/libgpuocelot.so - name: Install packages (amd) if: matrix.backend == 'amd' run: | diff --git a/examples/llama3.py b/examples/llama3.py index d172cc9b4a..e331573d1a 100644 --- a/examples/llama3.py +++ b/examples/llama3.py @@ -220,7 +220,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--download_model", action="store_true", help="Download a model") parser.add_argument("--model", type=Path, help="Model path") - parser.add_argument("--size", choices=["1B", "8B", "70B"], default="8B", help="Model size") + parser.add_argument("--size", choices=["1B", "8B", "70B"], default="1B", help="Model size") parser.add_argument("--shard", type=int, default=1, help="Shard the model across multiple devices") parser.add_argument("--quantize", choices=["int8", "nf4", "float16"], help="Quantization method") parser.add_argument("--no_api", action="store_true", help="Disable the api and run a cli test interface") @@ -234,8 +234,8 @@ if __name__ == "__main__": parser.add_argument("--profile", action="store_true", help="Output profile data") args = parser.parse_args() - assert (args.model and not args.download_model) or (not args.model and args.download_model), "either download or provide model" - if args.download_model: + # download_model is the default without a model passed in + if args.download_model or not args.model: if args.size == "1B": fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model", "tokenizer.model", subdir="llama3-1b-instruct") args.model = fetch("https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q6_K.gguf", "Llama-3.2-1B-Instruct-Q6_K.gguf", 
subdir="llama3-1b-instruct") diff --git a/examples/openpilot/compile2.py b/examples/openpilot/compile2.py deleted file mode 100644 index d862bf203c..0000000000 --- a/examples/openpilot/compile2.py +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env python3 -import os, sys, io, pathlib, json, struct -import numpy as np -sys.path.insert(0, str(pathlib.Path(__file__).parents[1])) - -if "FLOAT16" not in os.environ: os.environ["FLOAT16"] = "1" -if "IMAGE" not in os.environ: os.environ["IMAGE"] = "2" -if "NOLOCALS" not in os.environ: os.environ["NOLOCALS"] = "1" - -OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx" - -import onnx -from typing import Tuple, List, Optional, Dict, cast -from extra.onnx import get_run_onnx -from tinygrad import Tensor, Device, GlobalCounters, dtypes -from tinygrad.dtype import ImageDType -from tinygrad.device import Buffer -from tinygrad.helpers import partition, Context, fetch, getenv, DEBUG, tqdm -from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem, CompiledRunner -from tinygrad.engine.memory import memory_planner -from tinygrad.engine.schedule import ScheduleItem, create_schedule -from tinygrad.ops import Ops -from tinygrad.tensor import _to_np_dtype -Device.DEFAULT = "GPU" - -def get_schedule(onnx_data) -> Tuple[List[ScheduleItem], List[ScheduleItem]]: - Tensor.no_grad = True - Tensor.training = False - - # load the model - onnx_model = onnx.load(io.BytesIO(onnx_data)) - run_onnx = get_run_onnx(onnx_model) - input_shapes = {inp.name:tuple(x.dim_value for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input} - - # run the model - inputs = {k:Tensor.empty(*shp) for k,shp in input_shapes.items()} - ret: Tensor = next(iter(run_onnx(inputs).values())).cast(dtypes.float32).contiguous() - schedule = create_schedule([ret.lazydata]) - - # filter schedule that don't depend on the inputs - input_lb = [x.lazydata.base.buffer for x in inputs.values()] - depends = set(input_lb) - for si in schedule: - if any(b in depends for b in si.inputs): - for out in si.outputs: depends.add(out) - - # run all kernels that don't depend on the inputs - # NOTE: there's two extra kernels due to fusions that now happen since the weights aren't realized - schedule, schedule_independent = partition(schedule, lambda si: any(out in depends for out in si.outputs)) - print(f"{len(schedule)} schedule items depend on the input, {len(schedule_independent)} don't") - - # confirm no non-sink metaop in the (non independent) schedule except for the ones that load the input buffers - assert all(si.ast.op is Ops.SINK or out in input_lb for si in schedule for out in si.outputs), "has non SINK ops, can't compile to Thneed" - return schedule, schedule_independent, inputs - -def test_vs_onnx(onnx_data, eis:Optional[List[ExecItem]], inputs:Dict[str, Tensor]): - import onnx - #import pyopencl as cl - #from extra.thneed import Thneed - import numpy as np - onnx_model = onnx.load(io.BytesIO(onnx_data)) - - input_shapes = {inp.name:tuple(x.dim_value for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input} - Tensor.manual_seed(1337) - new_inputs = {k:Tensor.randn(*shp, requires_grad=False)*8 for k,shp in input_shapes.items()} - new_np_inputs = {k:v.realize().numpy() for k,v in new_inputs.items()} - - if getenv("ORT"): - # test with onnxruntime - import onnxruntime as ort - onnx_session = ort.InferenceSession(onnx_data) - onnx_output = onnx_session.run([onnx_model.graph.output[0].name], 
{k:v.astype(np.float16) for k,v in new_np_inputs.items()}) - new_torch_out = onnx_output[0] - print("got ort outputs") - else: - # test with torch - from test.models.test_onnx import run_onnx_torch - new_torch_out = run_onnx_torch(onnx_model, new_np_inputs).numpy() - print("got torch outputs") - - # if you don't have a schedule - if eis is None: - run_onnx = get_run_onnx(onnx_model) - new_tinygrad_out = next(iter(run_onnx(new_inputs).values())).cast(dtypes.float32).numpy() - np.testing.assert_allclose(new_torch_out, new_tinygrad_out, atol=1e-4, rtol=1e-2) - print("classic self-test passed!") - return - - # set inputs - for k,v in inputs.items(): v.lazydata.base.realized.copyin(new_np_inputs[k].data) - - # run code (all buffers have been allocated) - GlobalCounters.reset() - output = eis[-1].bufs[0] - for ei in eis: ei.run() - - new_tinygrad_out = np.frombuffer(output.as_buffer(), dtype=_to_np_dtype(output.dtype)) - np.testing.assert_allclose(new_torch_out.reshape(new_tinygrad_out.shape), new_tinygrad_out, atol=1e-4, rtol=1e-2) - print("semi-thneed self-test passed!") - -if __name__ == "__main__": - onnx_data = fetch(sys.argv[1] if len(sys.argv) > 1 else OPENPILOT_MODEL).read_bytes() - - # quick test for ONNX issues - #thneed_test_onnx(onnx_data, None) - #exit(0) - - schedule, schedule_independent, inputs = get_schedule(onnx_data) - schedule, schedule_input = partition(schedule, lambda x: x.ast.op is Ops.SINK) - print(f"{len(schedule_input)} inputs") - - run_schedule(schedule_independent) - run_schedule(schedule_input) - with Context(DEBUG=max(DEBUG.value, 2), BEAM=getenv("LATEBEAM")): - schedule = memory_planner(schedule) - for si in schedule: - for b in si.outputs: - assert not b.is_allocated(), "output should not be allocated" - image_count = sum(isinstance(out.dtype, ImageDType) for si in schedule for out in si.outputs) - print(f"**** compiling real kernels {image_count}/{len(schedule)} images ****") - eis = list(tqdm(lower_schedule(schedule), total=len(schedule))) - - print("kernel count:", len(eis)) - assert len(eis) <= getenv("ALLOWED_KERNEL_COUNT", 0) or getenv("ALLOWED_KERNEL_COUNT", 0) == 0, "too many kernels!" 
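The ALLOWED_KERNEL_COUNT assert just above and the gated read_image budget below are the two compile2.py checks this PR keeps: the compile3.py hunk later in this diff re-counts them over the captured JIT cache instead of the lowered schedule. A minimal sketch of that check, assuming a `TinyJit` that has already been called enough times for its `captured` state to exist:

```python
# Sketch of the budget check as compile3.py now performs it on a captured TinyJit.
# `jit` is any TinyJit that has already run; the real code is in the compile3.py hunk below.
from tinygrad.helpers import getenv
from tinygrad.engine.realize import CompiledRunner

def check_kernel_budget(jit):
  kernel_count, gated_read_image_count = 0, 0
  for ei in jit.captured.jit_cache:
    if isinstance(ei.prg, CompiledRunner):
      kernel_count += 1
      # "?read_image" marks an image load that sits behind a gate in the generated source
      gated_read_image_count += ei.prg.p.src.count("?read_image")
  # ALLOWED_KERNEL_COUNT=0 (the default) disables the check, same as compile2/compile3
  allowed = getenv("ALLOWED_KERNEL_COUNT", 0)
  assert allowed == 0 or kernel_count <= allowed, f"too many kernels! {kernel_count=} {allowed=}"
  if (allowed_gated := getenv("ALLOWED_GATED_READ_IMAGE", -1)) != -1:
    assert gated_read_image_count <= allowed_gated, f"too many gated read_image! {gated_read_image_count=}"
```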
- - # new simple thneed - def to_ref(b:Buffer): return struct.pack("Q", id(b)).decode("latin_1") - - seen_buffers = set() - input_buffers = [x.lazydata.buffer for x in inputs.values()] - jdat = {"binaries": [], "programs": {}, "kernels": [], "objects": []} - jdat["inputs"] = {k:to_ref(v.lazydata.buffer) for k,v in inputs.items()} - jdat["outputs"] = [to_ref(eis[-1].bufs[0])] - weights = [] - for i,ei in enumerate(eis): - #print("***", i) - for b in ei.bufs: - needs_load = b.is_allocated() and b not in input_buffers - #print(b, needs_load) - if b in seen_buffers: continue - seen_buffers.add(b) - if isinstance(b.dtype, ImageDType): - base_dtype = dtypes.float16 if b.dtype.fmt == 'e' else dtypes.float32 - row_pitch = (b.dtype.shape[0]*4*base_dtype.itemsize + 63)//64 * 64 - size = row_pitch * b.dtype.shape[1] - jdat['objects'].append({ - "id": to_ref(b), "needs_load": needs_load, "size": size, "arg_type": "image2d_t", - "width": b.dtype.shape[0], "height": b.dtype.shape[1], "row_pitch": row_pitch, "float32": b.dtype.base == dtypes.float32, - }) - if needs_load: - t = Tensor.empty(b.dtype.shape, dtype=b.dtype) - t.lazydata.buffer = b - data = t.cast(dtypes.float32).pad(((0, row_pitch//(4*base_dtype.itemsize)-b.dtype.shape[0]), (0,0), (0,0))).contiguous().numpy() - # NOTE: this cast must be done in numpy for platforms that don't support half - if base_dtype == dtypes.float16: data = data.astype(np.float16) - weights.append(data.tobytes()) - assert len(weights[-1]) == size, "wrong size buffer" - else: - jdat['objects'].append({ - "id": to_ref(b), "arg_type": b.dtype.name + "*", "needs_load": needs_load, "size": b.nbytes, - }) - if needs_load: - weights.append(b.as_buffer()) - assert len(weights[-1]) == b.nbytes, "wrong size buffer" - - saved_binaries = set() - binaries = [] - gated_read_image_count = 0 - GlobalCounters.reset() - with Context(DEBUG=max(DEBUG.value, 2)): - for ei in eis: - prg = cast(CompiledRunner, ei.prg) - assert len(prg.p.vars) == 0 - if prg.p.function_name not in saved_binaries: - jdat['binaries'].append({"name":prg.p.function_name, "length":len(prg.lib)}) - binaries.append(prg.lib) - saved_binaries.add(prg.p.function_name) - gated_read_image_count += prg.p.src.count("?read_image") - ei.run() - jdat['kernels'].append({ - "name": prg.p.function_name, - "work_dim": len(prg.p.global_size), - "global_work_size": prg.p.global_size, - "local_work_size": prg.p.local_size, - "num_args": len(ei.bufs), - "args": [to_ref(b) for b in ei.bufs], - "arg_size": [8]*len(ei.bufs), - }) - - if (allowed_gated_read_image:=getenv("ALLOWED_GATED_READ_IMAGE", -1)) != -1: - assert gated_read_image_count <= allowed_gated_read_image, \ - f"too many gated read_image! 
{gated_read_image_count=}, {allowed_gated_read_image=}" - - output_fn = sys.argv[2] if len(sys.argv) >= 3 else "/tmp/output.thneed" - print(f"saving thneed to {output_fn} with {len(weights)} buffers and {len(binaries)} binaries") - with open(output_fn, "wb") as f: - j = json.dumps(jdat, ensure_ascii=False).encode('latin_1') - f.write(struct.pack("I", len(j))) - f.write(j) - for w in weights: f.write(w) - for b in binaries: f.write(b) - print("saved", f.tell(), "bytes") - - FLOAT16 = getenv("FLOAT16", 0) - if FLOAT16 == 0: - try: - test_vs_onnx(onnx_data, eis, inputs) - except ModuleNotFoundError as e: - print(f"TEST NOT HAPPENING {e}") - - diff --git a/examples/openpilot/compile3.py b/examples/openpilot/compile3.py index 87776e777c..47ca0a41b2 100644 --- a/examples/openpilot/compile3.py +++ b/examples/openpilot/compile3.py @@ -5,9 +5,10 @@ if "IMAGE" not in os.environ: os.environ["IMAGE"] = "2" if "NOLOCALS" not in os.environ: os.environ["NOLOCALS"] = "1" if "JIT_BATCH_SIZE" not in os.environ: os.environ["JIT_BATCH_SIZE"] = "0" -from tinygrad import fetch, Tensor, TinyJit, Context, GlobalCounters +from tinygrad import fetch, Tensor, TinyJit, Context, GlobalCounters, Device from tinygrad.helpers import DEBUG, getenv from tinygrad.tensor import _from_np_dtype +from tinygrad.engine.realize import CompiledRunner import onnx from onnx.helper import tensor_dtype_to_np_dtype @@ -16,12 +17,11 @@ from extra.onnx import get_run_onnx # TODO: port to main tinygrad OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx" OUTPUT = "/tmp/openpilot.pkl" -def compile(): +def compile(onnx_file): + onnx_model = onnx.load(onnx_file) Tensor.no_grad = True Tensor.training = False - onnx_bytes = fetch(OPENPILOT_MODEL) - onnx_model = onnx.load(onnx_bytes) run_onnx = get_run_onnx(onnx_model) print("loaded model") @@ -30,51 +30,103 @@ def compile(): if getenv("FLOAT16", 0) == 0: input_types = {k:(np.float32 if v==np.float16 else v) for k,v in input_types.items()} Tensor.manual_seed(100) new_inputs = {k:Tensor.randn(*shp, dtype=_from_np_dtype(input_types[k])).mul(8).realize() for k,shp in sorted(input_shapes.items())} + new_inputs_numpy = {k:v.numpy() for k,v in new_inputs.items()} print("created tensors") - run_onnx_jit = TinyJit(lambda **kwargs: run_onnx(kwargs), prune=True) + run_onnx_jit = TinyJit(lambda **kwargs: + next(iter(run_onnx({k:v.to(Device.DEFAULT) for k,v in kwargs.items()}).values())).cast('float32'), prune=True) for i in range(3): GlobalCounters.reset() print(f"run {i}") + inputs = {**{k:v.clone() for k,v in new_inputs.items() if 'img' in k}, + **{k:Tensor(v, device="NPY").realize() for k,v in new_inputs_numpy.items() if 'img' not in k}} with Context(DEBUG=max(DEBUG.value, 2 if i == 2 else 1)): - ret = next(iter(run_onnx_jit(**new_inputs).values())).cast('float32').numpy() + ret = run_onnx_jit(**inputs).numpy() # copy i == 1 so use of JITBEAM is okay if i == 1: test_val = np.copy(ret) print(f"captured {len(run_onnx_jit.captured.jit_cache)} kernels") - np.testing.assert_equal(test_val, ret) + np.testing.assert_equal(test_val, ret, "JIT run failed") print("jit run validated") + # checks from compile2 + kernel_count = 0 + gated_read_image_count = 0 + for ei in run_onnx_jit.captured.jit_cache: + if isinstance(ei.prg, CompiledRunner): + kernel_count += 1 + gated_read_image_count += ei.prg.p.src.count("?read_image") + print(f"kernel_count: {kernel_count} gated_read_image_count: {gated_read_image_count}") + assert 
kernel_count <= getenv("ALLOWED_KERNEL_COUNT", 0) or getenv("ALLOWED_KERNEL_COUNT", 0) == 0, "too many kernels!" + if (allowed_gated_read_image:=getenv("ALLOWED_GATED_READ_IMAGE", -1)) != -1: + assert gated_read_image_count <= allowed_gated_read_image, \ + f"too many gated read_image! {gated_read_image_count=}, {allowed_gated_read_image=}" + with open(OUTPUT, "wb") as f: pickle.dump(run_onnx_jit, f) - mdl_sz = os.path.getsize(onnx_bytes) + mdl_sz = os.path.getsize(onnx_file) pkl_sz = os.path.getsize(OUTPUT) print(f"mdl size is {mdl_sz/1e6:.2f}M") print(f"pkl size is {pkl_sz/1e6:.2f}M") print("**** compile done ****") return test_val -def test(test_val=None): - with open(OUTPUT, "rb") as f: - run = pickle.load(f) - Tensor.manual_seed(100) - new_inputs = {nm:Tensor.randn(*st.shape, dtype=dtype).mul(8).realize() for nm, (st, _, dtype, _) in - sorted(zip(run.captured.expected_names, run.captured.expected_st_vars_dtype_device))} +def test_vs_compile(run, new_inputs, test_val=None): new_inputs_numpy = {k:v.numpy() for k,v in new_inputs.items()} + + # create fake "from_blob" tensors for the inputs, and wrapped NPY tensors for the numpy inputs (these have the same underlying memory) + inputs = {**{k:v for k,v in new_inputs.items() if 'img' in k}, + **{k:Tensor(v, device="NPY").realize() for k,v in new_inputs_numpy.items() if 'img' not in k}} + + # run 20 times for _ in range(20): st = time.perf_counter() - # Need to cast non-image inputs from numpy, this is only realistic way to run it - inputs = {**{k:v for k,v in new_inputs.items() if 'img' in k}, - **{k:Tensor(v) for k,v in new_inputs_numpy.items() if 'img' not in k}} out = run(**inputs) mt = time.perf_counter() - val = out['outputs'].numpy() + val = out.numpy() et = time.perf_counter() print(f"enqueue {(mt-st)*1e3:6.2f} ms -- total run {(et-st)*1e3:6.2f} ms") print(out, val.shape, val.dtype) if test_val is not None: np.testing.assert_equal(test_val, val) print("**** test done ****") -if __name__ == "__main__": - test_val = compile() if not getenv("RUN") else None - test(test_val) + # test that changing the numpy changes the model outputs + for v in new_inputs_numpy.values(): v *= 2 + out = run(**inputs) + changed_val = out.numpy() + np.testing.assert_raises(AssertionError, np.testing.assert_array_equal, val, changed_val) + return val +def test_vs_onnx(new_inputs, test_val, onnx_file): + new_inputs_numpy = {k:v.numpy() for k,v in new_inputs.items()} + onnx_model = onnx.load(onnx_file) + + if getenv("ORT"): + # test with onnxruntime + import onnxruntime as ort + onnx_session = ort.InferenceSession(onnx_file) + onnx_output = onnx_session.run([onnx_model.graph.output[0].name], {k:v.astype(np.float16) for k,v in new_inputs_numpy.items()}) + new_torch_out = onnx_output[0] + print("got ort outputs") + else: + # test with torch + from test.models.test_onnx import run_onnx_torch + # NOTE: we have to correct the order here + new_torch_out = run_onnx_torch(onnx_model, {k.name:new_inputs_numpy[k.name] for k in onnx_model.graph.input}).numpy() + print("got torch outputs") + + np.testing.assert_allclose(new_torch_out.reshape(test_val.shape), test_val, atol=1e-4, rtol=1e-2) + print("test vs onnx passed") + +if __name__ == "__main__": + onnx_file = fetch(OPENPILOT_MODEL) + test_val = compile(onnx_file) if not getenv("RUN") else None + + with open(OUTPUT, "rb") as f: pickle_loaded = pickle.load(f) + + # same randomness as compile + Tensor.manual_seed(100) + new_inputs = {nm:Tensor.randn(*st.shape, dtype=dtype).mul(8).realize() for nm, (st, _, dtype, _) in + 
sorted(zip(pickle_loaded.captured.expected_names, pickle_loaded.captured.expected_st_vars_dtype_device))} + + test_val = test_vs_compile(pickle_loaded, new_inputs, test_val) + if not getenv("FLOAT16"): test_vs_onnx(new_inputs, test_val, onnx_file) diff --git a/examples/openpilot/go.sh b/examples/openpilot/go.sh deleted file mode 100755 index dbd17a5e96..0000000000 --- a/examples/openpilot/go.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -NOLOCALS=1 FLOAT16=1 DEBUGCL=1 IMAGE=2 GPU=1 python3 examples/openpilot/compile2.py diff --git a/examples/self_tokenize.py b/examples/self_tokenize.py new file mode 100644 index 0000000000..372f1ac5a8 --- /dev/null +++ b/examples/self_tokenize.py @@ -0,0 +1,35 @@ +import os, pathlib +from examples.llama3 import Tokenizer +from tabulate import tabulate +from tinygrad import fetch +from tinygrad.helpers import flatten + +# llama 3 tokenizer +tokenizer = Tokenizer(fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model").as_posix()) + +def read_code(base_path): + ret = [] + for path, _, files in os.walk(os.path.join(base_path, "tinygrad")): + for name in files: + if not name.endswith(".py"): continue + if 'tinygrad/runtime/autogen' in path.replace('\\', '/'): continue + fullpath = os.path.join(path, name) + code = pathlib.Path(fullpath).read_text() + ret += [(fullpath.split("tinygrad/", 1)[1], code)] + return ret + +if __name__ == "__main__": + ret = read_code(".") + + table = [] + for name,code in ret: + table.append([name, len(tokenizer.encode(name+"\x00"+code))]) + print(tabulate([["name", "llm tokens"]]+sorted(table, key=lambda x: -x[1]), headers="firstrow")) + + code_str = '\x00'.join(flatten(ret)) + print(f"code has {len(code_str)} chars") + newline_count = code_str.count('\n') + print(f"code has {newline_count} newlines") + + encoded = tokenizer.encode(code_str) + print(f"code has {len(encoded)} tokens") diff --git a/examples/index.html b/examples/webgpu/efficientnet/index.html similarity index 97% rename from examples/index.html rename to examples/webgpu/efficientnet/index.html index dab046852e..fccd7e5744 100644 --- a/examples/index.html +++ b/examples/webgpu/efficientnet/index.html @@ -17,7 +17,7 @@ canvas { display: none; } * { text-align: center; font-family: monospace; } tinygrad has WebGPU - + @@ -61,7 +61,7 @@ canvas { display: none; } const getLabels = async () => (await fetch("https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json")).json(); - const getSavetensorBuffer = async () => new Uint8Array(await (await fetch("./net.safetensors")).arrayBuffer()); + const getSavetensorBuffer = async () => new Uint8Array(await (await fetch("../../net.safetensors")).arrayBuffer()); const reorderChannelsAndRemoveAlpha = (data) => { const out = []; diff --git a/examples/webgpu/stable_diffusion/compile.py b/examples/webgpu/stable_diffusion/compile.py index 91b3d5c3a4..a26be7c9dd 100644 --- a/examples/webgpu/stable_diffusion/compile.py +++ b/examples/webgpu/stable_diffusion/compile.py @@ -1,9 +1,10 @@ import os -from extra.export_model import compile_net, jit_model +from extra.export_model import compile_net, jit_model, dtype_to_js_type +from extra.f16_decompress import u32_to_f16 from examples.stable_diffusion import StableDiffusion from tinygrad.nn.state import get_state_dict, safe_save, safe_load_metadata, torch_load, load_state_dict from tinygrad.tensor import Tensor -from tinygrad import Device +from tinygrad import Device, dtypes from tinygrad.helpers import fetch 
from typing import NamedTuple, Any, List import requests @@ -29,7 +30,7 @@ def convert_f32_to_f16(input_file, output_file): rest_float32_values.tofile(f) def split_safetensor(fn): - _, json_len, metadata = safe_load_metadata(fn) + _, data_start, metadata = safe_load_metadata(fn) text_model_offset = 3772703308 chunk_size = 536870912 @@ -51,12 +52,12 @@ def split_safetensor(fn): part_offset = offset - last_offset if (part_offset >= chunk_size): - part_end_offsets.append(8+json_len+offset) + part_end_offsets.append(data_start+offset) last_offset = offset text_model_start = int(text_model_offset/2) net_bytes = bytes(open(fn, 'rb').read()) - part_end_offsets.append(text_model_start+8+json_len) + part_end_offsets.append(text_model_start+data_start) cur_pos = 0 for i, end_pos in enumerate(part_end_offsets): @@ -65,7 +66,7 @@ def split_safetensor(fn): cur_pos = end_pos with open(os.path.join(os.path.dirname(__file__), f'./net_textmodel.safetensors'), "wb+") as f: - f.write(net_bytes[text_model_start+8+json_len:]) + f.write(net_bytes[text_model_start+data_start:]) return part_end_offsets @@ -95,7 +96,8 @@ if __name__ == "__main__": sub_steps = [ Step(name = "textModel", input = [Tensor.randn(1, 77)], forward = model.cond_stage_model.transformer.text_model), Step(name = "diffusor", input = [Tensor.randn(1, 77, 768), Tensor.randn(1, 77, 768), Tensor.randn(1,4,64,64), Tensor.rand(1), Tensor.randn(1), Tensor.randn(1), Tensor.randn(1)], forward = model), - Step(name = "decoder", input = [Tensor.randn(1,4,64,64)], forward = model.decode) + Step(name = "decoder", input = [Tensor.randn(1,4,64,64)], forward = model.decode), + Step(name = "f16tof32", input = [Tensor.randn(2097120, dtype=dtypes.uint32)], forward = u32_to_f16) ] prg = "" @@ -116,19 +118,23 @@ if __name__ == "__main__": weights = {id(x.lazydata.base.realized): name for name, x in state.items()} kernel_code = '\n\n'.join([f"const {key} = `{fixup_code(code, key)}`;" for key, code in functions.items()]) kernel_names = ', '.join([name for (name, _, _, _) in statements]) + input_names = [name for _,name in special_names.items() if "input" in name] + output_names = [name for _,name in special_names.items() if "output" in name] + input_buf_types = [dtype_to_js_type(bufs[inp_name][1]) for inp_name in input_names] + output_buf_types = [dtype_to_js_type(bufs[out_name][1]) for out_name in output_names] kernel_calls = '\n '.join([f"addComputePass(device, commandEncoder, piplines[{i}], [{', '.join(args)}], {global_size});" for i, (_name, args, global_size, _local_size) in enumerate(statements) ]) - bufs = '\n '.join([f"const {name} = " + (f"createEmptyBuf(device, {size});" if _key not in weights else f"createWeightBuf(device, {size}, getTensorBuffer(safetensor, metadata['{weights[_key]}'], '{weights[_key]}'))") + ";" for name,(size,dtype,_key) in bufs.items()]) + exported_bufs = '\n '.join([f"const {name} = " + (f"createEmptyBuf(device, {size});" if _key not in weights else f"createWeightBuf(device, {size}, getTensorBuffer(safetensor, metadata['{weights[_key]}'], '{weights[_key]}'))") + ";" for name,(size,dtype,_key) in bufs.items()]) gpu_write_bufs = '\n '.join([f"const gpuWriteBuffer{i} = device.createBuffer({{size:input{i}.size, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE }});" for i,(_,value) in enumerate(special_names.items()) if "output" not in value]) - input_writer = '\n '.join([f"await gpuWriteBuffer{i}.mapAsync(GPUMapMode.WRITE);\n new Float32Array(gpuWriteBuffer{i}.getMappedRange()).set(" + f'data{i});' + f"\n 
gpuWriteBuffer{i}.unmap();\ncommandEncoder.copyBufferToBuffer(gpuWriteBuffer{i}, 0, input{i}, 0, gpuWriteBuffer{i}.size);" for i,(_,value) in enumerate(special_names.items()) if value != "output0"]) + input_writer = '\n '.join([f"await gpuWriteBuffer{i}.mapAsync(GPUMapMode.WRITE);\n new {input_buf_types[i]}(gpuWriteBuffer{i}.getMappedRange()).set(" + f'data{i});' + f"\n gpuWriteBuffer{i}.unmap();\ncommandEncoder.copyBufferToBuffer(gpuWriteBuffer{i}, 0, input{i}, 0, gpuWriteBuffer{i}.size);" for i,_ in enumerate(input_names)]) return f"""\n var {step.name} = function() {{ {kernel_code} return {{ "setup": async (device, safetensor) => {{ - const metadata = getTensorMetadata(safetensor[0]); + const metadata = safetensor ? getTensorMetadata(safetensor[0]) : null; - {bufs} + {exported_bufs} {gpu_write_bufs} const gpuReadBuffer = device.createBuffer({{ size: output0.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ }}); @@ -147,8 +153,8 @@ if __name__ == "__main__": device.queue.submit([gpuCommands]); await gpuReadBuffer.mapAsync(GPUMapMode.READ); - const resultBuffer = new Float32Array(gpuReadBuffer.size/4); - resultBuffer.set(new Float32Array(gpuReadBuffer.getMappedRange())); + const resultBuffer = new {output_buf_types[0]}(gpuReadBuffer.size/{bufs[output_names[0]][1].itemsize}); + resultBuffer.set(new {output_buf_types[0]}(gpuReadBuffer.getMappedRange())); gpuReadBuffer.unmap(); return resultBuffer; }} diff --git a/examples/webgpu/stable_diffusion/index.html b/examples/webgpu/stable_diffusion/index.html index 464c908f70..08488f8a84 100644 --- a/examples/webgpu/stable_diffusion/index.html +++ b/examples/webgpu/stable_diffusion/index.html @@ -165,10 +165,6 @@ import ClipTokenizer from './clip_tokenizer.js'; window.clipTokenizer = new ClipTokenizer(); - @@ -214,6 +210,8 @@
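A note on the compile3.py hunks above, which are easy to follow end to end only once reassembled: the new flow pickles the whole captured TinyJit, reloads it, rebuilds inputs from the shapes and dtypes recorded at capture time, and feeds non-image inputs as NPY-device tensors that share memory with numpy arrays. A condensed sketch of that round trip, assuming /tmp/openpilot.pkl was already written by a compile() run:

```python
# Reload the pickled TinyJit, rebuild matching inputs, and verify that editing the
# wrapped numpy arrays in place changes the model output (the test_vs_compile check).
import pickle
import numpy as np
from tinygrad import Tensor

with open("/tmp/openpilot.pkl", "rb") as f:
  run = pickle.load(f)   # a TinyJit; captured kernels and weight buffers come along

Tensor.manual_seed(100)  # same seed as compile(), so outputs stay comparable
new_inputs = {nm: Tensor.randn(*st.shape, dtype=dtype).mul(8).realize()
              for nm, (st, _, dtype, _) in
              sorted(zip(run.captured.expected_names, run.captured.expected_st_vars_dtype_device))}
new_inputs_numpy = {k: v.numpy() for k, v in new_inputs.items()}

# image inputs stay as regular tensors, everything else is a zero-copy NPY wrapper
inputs = {**{k: v for k, v in new_inputs.items() if 'img' in k},
          **{k: Tensor(v, device="NPY").realize() for k, v in new_inputs_numpy.items() if 'img' not in k}}

val = run(**inputs).numpy()
for v in new_inputs_numpy.values(): v *= 2   # edit in place, without re-wrapping
changed_val = run(**inputs).numpy()
np.testing.assert_raises(AssertionError, np.testing.assert_array_equal, val, changed_val)
print("in-place numpy edits reached the model")
```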
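On the split_safetensor change in webgpu/stable_diffusion/compile.py: the offsets move from hand-computed `8+json_len` to the `data_start` now returned by `safe_load_metadata`. Both describe the same position, since a .safetensors file is an 8-byte little-endian header length followed by the JSON header and then the tensor data. A small sketch of that layout (the file name is a placeholder):

```python
# Read just the safetensors header to see where the tensor data begins.
# data_start == 8 + json_len, which is what the old code computed by hand.
import json, struct

with open("net.safetensors", "rb") as f:           # placeholder path
  json_len = struct.unpack("<Q", f.read(8))[0]     # header size, little-endian uint64
  header = json.loads(f.read(json_len).decode("utf-8"))
data_start = 8 + json_len                          # the value safe_load_metadata now returns

# each tensor's "data_offsets" in the header is relative to data_start
first = next(k for k in header if k != "__metadata__")
begin, end = header[first]["data_offsets"]
print(f"{first}: bytes {data_start+begin}..{data_start+end} of the file")
```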
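The other compile.py change worth calling out is why the hard-coded `Float32Array` had to go: the new f16tof32 step has uint32 inputs, so the generated JS now picks its typed array from the buffer dtype via `dtype_to_js_type`. The real mapping lives in extra/export_model.py; the table below is only a hypothetical illustration of the idea, not that file's actual code:

```python
# Hypothetical sketch of a dtype -> JS typed-array name mapping (assumed, not the
# real dtype_to_js_type implementation), showing how the f16tof32 step ends up
# with a Uint32Array write buffer instead of a Float32Array.
from tinygrad import dtypes

JS_TYPED_ARRAY = {dtypes.float32: "Float32Array", dtypes.uint32: "Uint32Array",
                  dtypes.int32: "Int32Array", dtypes.uint8: "Uint8Array", dtypes.int8: "Int8Array"}

def dtype_to_js_type_sketch(dtype): return JS_TYPED_ARRAY[dtype]

print(dtype_to_js_type_sketch(dtypes.uint32))   # -> "Uint32Array"
```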