diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2bb4f344dd..7e9b2008b3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -351,46 +351,44 @@ jobs: export COMMIT_MESSAGE=$(git show -s --format=%B ${{ github.event.pull_request.head.sha }}) cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py - #testwebgpu: - # name: WebGPU Tests - # runs-on: macos-13 - # timeout-minutes: 20 - # steps: - # - name: Checkout Code - # uses: actions/checkout@v4 - # - name: Set up Python 3.11 - # uses: actions/setup-python@v5 - # with: - # python-version: 3.11 - # - name: Cache python packages - # uses: actions/cache@v4 - # with: - # path: /Users/runner/Library/Python/3.11/lib/python/site-packages - # key: webgpu-testing-user3-packages-${{ hashFiles('**/setup.py') }} - # - name: Install Dependencies - # run: pip install --user -e '.[webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu - # - name: Cache downloads - # uses: actions/cache@v4 - # with: - # path: ~/Library/Caches/tinygrad/downloads/ - # key: downloads-cache-webgpu-${{ env.DOWNLOAD_CACHE_VERSION }} - # - name: Check Device.DEFAULT (WEBGPU) and print some source - # run: | - # WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT" - # WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add - #- name: Run webgpu pytest - # run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto - # - name: Run selected webgpu tests - # run: | - # WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto test/test_ops.py test/test_dtype.py \ - # test/test_jit.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_linearizer.py \ - # test/test_linearizer_failures.py test/test_nn.py - # - name: Build WEBGPU Efficientnet - # run: WEBGPU=1 
WGPU_BACKEND_TYPE=Metal python -m examples.compile_efficientnet - # - name: Install Puppeteer - # run: npm install puppeteer - # - name: Run WEBGPU Efficientnet - # run: node test/web/test_webgpu.js + testwebgpu: + name: WebGPU Tests + runs-on: macos-14 + timeout-minutes: 20 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Cache python packages + uses: actions/cache@v4 + with: + path: /Users/runner/Library/Python/3.12/lib/python/site-packages + key: webgpu-testing-user4-packages-${{ hashFiles('**/setup.py') }} + - name: Install Dependencies + run: pip install --user -e '.[webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu + - name: Cache downloads + uses: actions/cache@v4 + with: + path: ~/Library/Caches/tinygrad/downloads/ + key: downloads-cache-webgpu-${{ env.DOWNLOAD_CACHE_VERSION }} + - name: Check Device.DEFAULT (WEBGPU) and print some source + run: | + WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT" + WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add + - name: Build WEBGPU Efficientnet + run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python3 -m examples.compile_efficientnet + - name: Install Puppeteer + run: npm install puppeteer + - name: Run WEBGPU Efficientnet + run: node test/web/test_webgpu.js + - name: Run selected webgpu tests + run: | + WEBGPU=1 WGPU_BACKEND_TYPE=Metal python3 -m pytest test/test_assign.py test/test_arange.py test/test_const_folding.py test/test_dtype.py \ + test/test_dtype_alu.py test/test_conv.py test/test_conv_shapetracker.py test/test_nn.py test/test_ops.py test/test_optim.py \ + test/test_randomness.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_uops_stats.py test/test_uops.py --durations=20 testmetal: name: Metal Tests diff --git a/README.md b/README.md index 5d41fc0e16..24caec6c29 100644 --- a/README.md +++ 
b/README.md @@ -88,6 +88,7 @@ tinygrad already supports numerous accelerators, including: - [x] [AMD](tinygrad/runtime/ops_amd.py) - [x] [NV](tinygrad/runtime/ops_nv.py) - [x] [QCOM](tinygrad/runtime/ops_qcom.py) +- [x] [WEBGPU](tinygrad/runtime/ops_webgpu.py) And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops. diff --git a/examples/compile_efficientnet.py b/examples/compile_efficientnet.py index 24f7f4a6a6..fa0b64c450 100644 --- a/examples/compile_efficientnet.py +++ b/examples/compile_efficientnet.py @@ -1,7 +1,7 @@ from pathlib import Path from extra.models.efficientnet import EfficientNet from tinygrad.tensor import Tensor -from tinygrad.nn.state import safe_save +from tinygrad.nn.state import get_state_dict, safe_save, safe_load, load_state_dict from extra.export_model import export_model from tinygrad.helpers import getenv, fetch import ast @@ -9,11 +9,15 @@ import ast if __name__ == "__main__": model = EfficientNet(0) model.load_from_pretrained() + dirname = Path(__file__).parent + # exporting a model that's loaded from safetensors doesn't work without loading in from safetensors first + # loading the state dict from a safetensor file changes the generated kernels + if getenv("WEBGPU") or getenv("WEBGL"): + safe_save(get_state_dict(model), (dirname / "net.safetensors").as_posix()) + load_state_dict(model, safe_load(str(dirname / "net.safetensors"))) mode = "clang" if getenv("CLANG", "") != "" else "webgpu" if getenv("WEBGPU", "") != "" else "webgl" if getenv("WEBGL", "") != "" else "" prg, inp_sizes, out_sizes, state = export_model(model, mode, Tensor.randn(1,3,224,224)) - dirname = Path(__file__).parent if getenv("CLANG", "") == "": - safe_save(state, (dirname / "net.safetensors").as_posix()) ext = "js" if getenv("WEBGPU", "") != "" or getenv("WEBGL", "") != "" else "json" with open(dirname / f"net.{ext}", "w") as text_file: text_file.write(prg) diff --git a/examples/webgpu/stable_diffusion/compile.py 
b/examples/webgpu/stable_diffusion/compile.py index d0a5337667..91b3d5c3a4 100644 --- a/examples/webgpu/stable_diffusion/compile.py +++ b/examples/webgpu/stable_diffusion/compile.py @@ -6,7 +6,7 @@ from tinygrad.tensor import Tensor from tinygrad import Device from tinygrad.helpers import fetch from typing import NamedTuple, Any, List -from pathlib import Path +import requests import argparse import numpy as np @@ -60,16 +60,22 @@ def split_safetensor(fn): cur_pos = 0 for i, end_pos in enumerate(part_end_offsets): - with open(f'./net_part{i}.safetensors', "wb+") as f: + with open(os.path.join(os.path.dirname(__file__), f'./net_part{i}.safetensors'), "wb+") as f: f.write(net_bytes[cur_pos:end_pos]) cur_pos = end_pos - with open(f'./net_textmodel.safetensors', "wb+") as f: + with open(os.path.join(os.path.dirname(__file__), f'./net_textmodel.safetensors'), "wb+") as f: f.write(net_bytes[text_model_start+8+json_len:]) return part_end_offsets +def fetch_dep(file, url): + with open(file, "w", encoding="utf-8") as f: + f.write(requests.get(url).text.replace("https://huggingface.co/wpmed/tinygrad-sd-f16/raw/main/bpe_simple_vocab_16e6.mjs", "./bpe_simple_vocab_16e6.mjs")) + if __name__ == "__main__": + fetch_dep(os.path.join(os.path.dirname(__file__), "clip_tokenizer.js"), "https://huggingface.co/wpmed/tinygrad-sd-f16/raw/main/clip_tokenizer.js") + fetch_dep(os.path.join(os.path.dirname(__file__), "bpe_simple_vocab_16e6.mjs"), "https://huggingface.co/wpmed/tinygrad-sd-f16/raw/main/bpe_simple_vocab_16e6.mjs") parser = argparse.ArgumentParser(description='Run Stable Diffusion', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--remoteweights', action='store_true', help="Use safetensors from Huggingface, or from local") args = parser.parse_args() @@ -94,12 +100,21 @@ if __name__ == "__main__": prg = "" + def fixup_code(code, key): + code = code.replace(key, 'main')\ + .replace("var INFINITY : f32;\n", "fn inf(a: f32) -> f32 { return a/0.0; }\n")\ + 
.replace("@group(0) @binding(0)", "")\ + .replace("INFINITY", "inf(1.0)") + + for i in range(1,9): code = code.replace(f"binding({i})", f"binding({i-1})") + return code + def compile_step(model, step: Step): run, special_names = jit_model(step, *step.input) functions, statements, bufs, _ = compile_net(run, special_names) state = get_state_dict(model) weights = {id(x.lazydata.base.realized): name for name, x in state.items()} - kernel_code = '\n\n'.join([f"const {key} = `{code.replace(key, 'main')}`;" for key, code in functions.items()]) + kernel_code = '\n\n'.join([f"const {key} = `{fixup_code(code, key)}`;" for key, code in functions.items()]) kernel_names = ', '.join([name for (name, _, _, _) in statements]) kernel_calls = '\n '.join([f"addComputePass(device, commandEncoder, piplines[{i}], [{', '.join(args)}], {global_size});" for i, (_name, args, global_size, _local_size) in enumerate(statements) ]) bufs = '\n '.join([f"const {name} = " + (f"createEmptyBuf(device, {size});" if _key not in weights else f"createWeightBuf(device, {size}, getTensorBuffer(safetensor, metadata['{weights[_key]}'], '{weights[_key]}'))") + ";" for name,(size,dtype,_key) in bufs.items()]) @@ -148,14 +163,14 @@ if __name__ == "__main__": if step.name == "diffusor": if args.remoteweights: - base_url = "https://huggingface.co/wpmed/tinygrad-sd-f16/resolve/main" + base_url = "https://huggingface.co/wpmed/stable-diffusion-f16-new/resolve/main" else: state = get_state_dict(model) safe_save(state, os.path.join(os.path.dirname(__file__), "net.safetensors")) - convert_f32_to_f16("./net.safetensors", "./net_conv.safetensors") - split_safetensor("./net_conv.safetensors") - os.remove("net.safetensors") - os.remove("net_conv.safetensors") + convert_f32_to_f16(os.path.join(os.path.dirname(__file__), "./net.safetensors"), os.path.join(os.path.dirname(__file__), "./net_conv.safetensors")) + split_safetensor(os.path.join(os.path.dirname(__file__), "./net_conv.safetensors")) + 
os.remove(os.path.join(os.path.dirname(__file__), "net.safetensors")) + os.remove(os.path.join(os.path.dirname(__file__), "net_conv.safetensors")) base_url = "." prekernel = f""" @@ -185,20 +200,6 @@ if __name__ == "__main__": counter++; }} - let allZero = true; - let out = safetensorParts[selectedPart].subarray(...correctedOffsets); - - for (let i = 0; i < out.length; i++) {{ - if (out[i] !== 0) {{ - allZero = false; - break; - }} - }} - - if (allZero) {{ - console.log("Error: weight '" + key + "' is all zero."); - }} - return safetensorParts[selectedPart].subarray(...correctedOffsets); }} diff --git a/examples/webgpu/stable_diffusion/f16_to_f32.js b/examples/webgpu/stable_diffusion/f16_to_f32.js deleted file mode 100644 index fb91ea0ea6..0000000000 --- a/examples/webgpu/stable_diffusion/f16_to_f32.js +++ /dev/null @@ -1,64 +0,0 @@ -const f16tof32 = ` -fn u16_to_f16(x: u32) -> f32 { - let sign = f32((x >> 15) & 0x1); - let exponent = f32((x >> 10) & 0x1F); - let fraction = f32(x & 0x3FF); - - let sign_multiplier = select(1.0, -1.0, sign == 1.0); - if (exponent == 0.0) { - return sign_multiplier * 6.103515625e-5 * (fraction / 1024.0); - } else { - return sign_multiplier * exp2(exponent - 15.0) * (1.0 + fraction / 1024.0); - } -} - -@group(0) @binding(0) var data0: array; -@group(0) @binding(1) var data1: array; -@compute @workgroup_size(256) fn main(@builtin(global_invocation_id) gid: vec3) { - let gidx = gid.x; - let outgidx = gidx*2; - - if (gidx >= arrayLength(&data0)) { - return; - } - - let oo = data0[gidx]; - let oo1 = (oo >> 16); - let oo2 = oo & 0xFFFFu; - - let f1 = u16_to_f16(oo2); - let f2 = u16_to_f16(oo1); - - data1[outgidx] = f1; - data1[outgidx + 1] = f2; -}`; - -window.f16tof32GPU = async(device, inf16) => { - const input = device.createBuffer({size: inf16.length, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST }); - const output = device.createBuffer({size: inf16.length*2, usage: GPUBufferUsage.STORAGE | 
GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST }); - - const gpuWriteBuffer = device.createBuffer({size: input.size, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE }); - const gpuReadBuffer = device.createBuffer({ size: output.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ }); - const commandEncoder = device.createCommandEncoder(); - await gpuWriteBuffer.mapAsync(GPUMapMode.WRITE); - - const alignedUint32View = new Uint32Array(inf16.buffer, inf16.byteOffset, inf16.length / 4); - new Uint32Array(gpuWriteBuffer.getMappedRange()).set(alignedUint32View); - - gpuWriteBuffer.unmap(); - commandEncoder.copyBufferToBuffer(gpuWriteBuffer, 0, input, 0, gpuWriteBuffer.size); - const pipeline = await device.createComputePipelineAsync({layout: "auto", compute: { module: device.createShaderModule({ code: f16tof32 }), entryPoint: "main" }}); - - addComputePass(device, commandEncoder, pipeline, [input, output], [Math.ceil(inf16.length/(4*256)), 1, 1]); - - commandEncoder.copyBufferToBuffer(output, 0, gpuReadBuffer, 0, output.size); - const gpuCommands = commandEncoder.finish(); - device.queue.submit([gpuCommands]); - - await gpuReadBuffer.mapAsync(GPUMapMode.READ); - const resultBuffer = new Float32Array(gpuReadBuffer.size/4); - resultBuffer.set(new Float32Array(gpuReadBuffer.getMappedRange())); - gpuReadBuffer.unmap(); - - return resultBuffer; -} diff --git a/examples/webgpu/stable_diffusion/index.html b/examples/webgpu/stable_diffusion/index.html index d2fb1dfbda..464c908f70 100644 --- a/examples/webgpu/stable_diffusion/index.html +++ b/examples/webgpu/stable_diffusion/index.html @@ -5,103 +5,213 @@ tinygrad has WebGPU - - + - -

WebGPU is not supported in this browser

-

StableDiffusion by tinygrad WebGPU

-
- +

WebGPU is not supported in this browser

+

StableDiffusion powered by tinygrad

+ + GitHub Logo + - +
+ - - - + -
- Downloading model - - + + + + +
+ Downloading model + + +
+ + + +
- -
- +