Bring back WebGPU (#7063)

* Start from andredaprato:webgpu-clean

* Fix infs

* inf wgsl function is not needed

* Emulated ulong for threefry, more tests passing

* Randomness tests passing

* Update model export to support new changes in webgpu, efficientnet export works again

* Simplify shift emulation in wgsl

* Delete test file

* Fix u32 literals bigger than u32 max

* Why was skip copies added here?

* Python3.12 for webgpu tests

* Fix model export syntax error

* Get test ops passing with some skips

* Fix lint

* Much simpler shift

* Run more tests

* Timestamp queries are not supported in CI, so skip search tests

* All fancy indexing passing

* r is ctx

* Run more dtype tests by using is_dtype_supported

* Cleanup ulong shift rendering

* UPat -> Pat, UOps -> Ops

* Pat -> UPat

* Refactor render_ushift if-else

* Pattern to avoid ulong mul

* Remove vals_dtype

* is_nan trick + rewrite, test_isnan passing (bit-level sketch after this list)

* Rewrite a * select(1, nan, gate) -> select(a, nan, gate)

* No arg, just op

* Support char, uchar, short, ushort

* Run test_index_mnist now that we have uint8

* Fix pylint

* Save 3 lines by using base Compiler

* No more long emulation

* Remove fixup_binops

* No more external_local_bufs wgsl-specific cstyle modification, use base extra_pm

* Simpler, faster copyin/out

* Skip some new tests that use long

* Fix typo

* copyout touchup

* Save lines by using render_cast

* WebGL is not supported in core, delete it from is_dtype_supported

* More narrow test skips for some unary tests

* TernaryOps, UnaryOps -> Ops

* TinyGrad supports WebGPU

* StableDiffusion demo: f16-to-f32 GPU conversion is now a lib, update UI

* Packed load/store, no more scale_size, no core tinygrad changes

* Rename copyin, copyout

* Device -> dev

* Fix lint

* Pattern matcher rule for packed load/store

* Refactor

* Shorter packed load/store

* This should fix lint

* Fix mypy

* SD compile script working

* New SD webgpu UI

* New default prompt

* New SD weights

* Fix title when webgpu not available

* Run symbolic tests, simplify is_nan, use round_up

* Show step time on UI

* Bump minimum wgpu version to v0.19

* Fix latent

---------
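
A note on the "is_nan trick" bullet: WGSL has no NaN literal, and shader compilers may assume NaNs never occur and fold x != x away, so NaN must be produced (see the nan() helper in the renderer) and detected by other means. As an illustration of the underlying bit-level idea, not the commit's actual pattern-matcher rewrite, a Python sketch:

import math, struct

def is_nan_bits(x: float) -> bool:
  # IEEE 754: NaN <=> exponent all ones and mantissa non-zero,
  # i.e. (bits & 0x7fffffff) > 0x7f800000 for a 32-bit float
  bits = struct.unpack("<I", struct.pack("<f", x))[0]
  return (bits & 0x7FFFFFFF) > 0x7F800000

for v in (float("-inf"), 0.0, float("inf"), float("nan"), 1.1):  # the test_isnan values
  assert is_nan_bits(v) == math.isnan(v)

The same predicate ports to WGSL by bitcasting the f32 to u32 first.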

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
Authored by Ahmed Harmouche, committed by GitHub on 2024-11-26 05:26:40 +01:00
parent ff3f2a9c1a, commit 10618aba98
18 changed files with 659 additions and 402 deletions


@@ -351,46 +351,44 @@ jobs:
export COMMIT_MESSAGE=$(git show -s --format=%B ${{ github.event.pull_request.head.sha }})
cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
#testwebgpu:
# name: WebGPU Tests
# runs-on: macos-13
# timeout-minutes: 20
# steps:
# - name: Checkout Code
# uses: actions/checkout@v4
# - name: Set up Python 3.11
# uses: actions/setup-python@v5
# with:
# python-version: 3.11
# - name: Cache python packages
# uses: actions/cache@v4
# with:
# path: /Users/runner/Library/Python/3.11/lib/python/site-packages
# key: webgpu-testing-user3-packages-${{ hashFiles('**/setup.py') }}
# - name: Install Dependencies
# run: pip install --user -e '.[webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
# - name: Cache downloads
# uses: actions/cache@v4
# with:
# path: ~/Library/Caches/tinygrad/downloads/
# key: downloads-cache-webgpu-${{ env.DOWNLOAD_CACHE_VERSION }}
# - name: Check Device.DEFAULT (WEBGPU) and print some source
# run: |
# WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT"
# WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
#- name: Run webgpu pytest
# run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto
# - name: Run selected webgpu tests
# run: |
# WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto test/test_ops.py test/test_dtype.py \
# test/test_jit.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_linearizer.py \
# test/test_linearizer_failures.py test/test_nn.py
# - name: Build WEBGPU Efficientnet
# run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.compile_efficientnet
# - name: Install Puppeteer
# run: npm install puppeteer
# - name: Run WEBGPU Efficientnet
# run: node test/web/test_webgpu.js
testwebgpu:
name: WebGPU Tests
runs-on: macos-14
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: 3.12
- name: Cache python packages
uses: actions/cache@v4
with:
path: /Users/runner/Library/Python/3.11/lib/python/site-packages
key: webgpu-testing-user3-packages-${{ hashFiles('**/setup.py') }}
- name: Install Dependencies
run: pip install --user -e '.[webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Cache downloads
uses: actions/cache@v4
with:
path: ~/Library/Caches/tinygrad/downloads/
key: downloads-cache-webgpu-${{ env.DOWNLOAD_CACHE_VERSION }}
- name: Check Device.DEFAULT (WEBGPU) and print some source
run: |
WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT"
WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
- name: Build WEBGPU Efficientnet
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python3 -m examples.compile_efficientnet
- name: Install Puppeteer
run: npm install puppeteer
- name: Run WEBGPU Efficientnet
run: node test/web/test_webgpu.js
- name: Run selected webgpu tests
run: |
WEBGPU=1 WGPU_BACKEND_TYPE=Metal python3 -m pytest test/test_assign.py test/test_arange.py test/test_const_folding.py test/test_dtype.py \
test/test_dtype_alu.py test/test_conv.py test/test_conv_shapetracker.py test/test_nn.py test/test_ops.py test/test_optim.py \
test/test_randomness.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_uops_stats.py test/test_uops.py --durations=20
testmetal:
name: Metal Tests


@@ -88,6 +88,7 @@ tinygrad already supports numerous accelerators, including:
- [x] [AMD](tinygrad/runtime/ops_amd.py)
- [x] [NV](tinygrad/runtime/ops_nv.py)
- [x] [QCOM](tinygrad/runtime/ops_qcom.py)
- [x] [WEBGPU](tinygrad/runtime/ops_webgpu.py)
And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.


@@ -1,7 +1,7 @@
from pathlib import Path
from extra.models.efficientnet import EfficientNet
from tinygrad.tensor import Tensor
from tinygrad.nn.state import safe_save
from tinygrad.nn.state import get_state_dict, safe_save, safe_load, load_state_dict
from extra.export_model import export_model
from tinygrad.helpers import getenv, fetch
import ast
@@ -9,11 +9,15 @@ import ast
if __name__ == "__main__":
model = EfficientNet(0)
model.load_from_pretrained()
dirname = Path(__file__).parent
# exporting a model that's loaded from safetensors doesn't work without loading in from safetensors first
# loading the state dict from a safetensor file changes the generated kernels
if getenv("WEBGPU") or getenv("WEBGL"):
safe_save(get_state_dict(model), (dirname / "net.safetensors").as_posix())
load_state_dict(model, safe_load(str(dirname / "net.safetensors")))
mode = "clang" if getenv("CLANG", "") != "" else "webgpu" if getenv("WEBGPU", "") != "" else "webgl" if getenv("WEBGL", "") != "" else ""
prg, inp_sizes, out_sizes, state = export_model(model, mode, Tensor.randn(1,3,224,224))
dirname = Path(__file__).parent
if getenv("CLANG", "") == "":
safe_save(state, (dirname / "net.safetensors").as_posix())
ext = "js" if getenv("WEBGPU", "") != "" or getenv("WEBGL", "") != "" else "json"
with open(dirname / f"net.{ext}", "w") as text_file:
text_file.write(prg)


@@ -6,7 +6,7 @@ from tinygrad.tensor import Tensor
from tinygrad import Device
from tinygrad.helpers import fetch
from typing import NamedTuple, Any, List
from pathlib import Path
import requests
import argparse
import numpy as np
@@ -60,16 +60,22 @@ def split_safetensor(fn):
cur_pos = 0
for i, end_pos in enumerate(part_end_offsets):
with open(f'./net_part{i}.safetensors', "wb+") as f:
with open(os.path.join(os.path.dirname(__file__), f'./net_part{i}.safetensors'), "wb+") as f:
f.write(net_bytes[cur_pos:end_pos])
cur_pos = end_pos
with open(f'./net_textmodel.safetensors', "wb+") as f:
with open(os.path.join(os.path.dirname(__file__), f'./net_textmodel.safetensors'), "wb+") as f:
f.write(net_bytes[text_model_start+8+json_len:])
return part_end_offsets
def fetch_dep(file, url):
with open(file, "w", encoding="utf-8") as f:
f.write(requests.get(url).text.replace("https://huggingface.co/wpmed/tinygrad-sd-f16/raw/main/bpe_simple_vocab_16e6.mjs", "./bpe_simple_vocab_16e6.mjs"))
if __name__ == "__main__":
fetch_dep(os.path.join(os.path.dirname(__file__), "clip_tokenizer.js"), "https://huggingface.co/wpmed/tinygrad-sd-f16/raw/main/clip_tokenizer.js")
fetch_dep(os.path.join(os.path.dirname(__file__), "bpe_simple_vocab_16e6.mjs"), "https://huggingface.co/wpmed/tinygrad-sd-f16/raw/main/bpe_simple_vocab_16e6.mjs")
parser = argparse.ArgumentParser(description='Run Stable Diffusion', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--remoteweights', action='store_true', help="Use safetensors from Huggingface, or from local")
args = parser.parse_args()
@@ -94,12 +100,21 @@ if __name__ == "__main__":
prg = ""
def fixup_code(code, key):
code = code.replace(key, 'main')\
.replace("var<uniform> INFINITY : f32;\n", "fn inf(a: f32) -> f32 { return a/0.0; }\n")\
.replace("@group(0) @binding(0)", "")\
.replace("INFINITY", "inf(1.0)")
for i in range(1,9): code = code.replace(f"binding({i})", f"binding({i-1})")
return code
def compile_step(model, step: Step):
run, special_names = jit_model(step, *step.input)
functions, statements, bufs, _ = compile_net(run, special_names)
state = get_state_dict(model)
weights = {id(x.lazydata.base.realized): name for name, x in state.items()}
kernel_code = '\n\n'.join([f"const {key} = `{code.replace(key, 'main')}`;" for key, code in functions.items()])
kernel_code = '\n\n'.join([f"const {key} = `{fixup_code(code, key)}`;" for key, code in functions.items()])
kernel_names = ', '.join([name for (name, _, _, _) in statements])
kernel_calls = '\n '.join([f"addComputePass(device, commandEncoder, piplines[{i}], [{', '.join(args)}], {global_size});" for i, (_name, args, global_size, _local_size) in enumerate(statements) ])
bufs = '\n '.join([f"const {name} = " + (f"createEmptyBuf(device, {size});" if _key not in weights else f"createWeightBuf(device, {size}, getTensorBuffer(safetensor, metadata['{weights[_key]}'], '{weights[_key]}'))") + ";" for name,(size,dtype,_key) in bufs.items()])
@@ -148,14 +163,14 @@ if __name__ == "__main__":
if step.name == "diffusor":
if args.remoteweights:
base_url = "https://huggingface.co/wpmed/tinygrad-sd-f16/resolve/main"
base_url = "https://huggingface.co/wpmed/stable-diffusion-f16-new/resolve/main"
else:
state = get_state_dict(model)
safe_save(state, os.path.join(os.path.dirname(__file__), "net.safetensors"))
convert_f32_to_f16("./net.safetensors", "./net_conv.safetensors")
split_safetensor("./net_conv.safetensors")
os.remove("net.safetensors")
os.remove("net_conv.safetensors")
convert_f32_to_f16(os.path.join(os.path.dirname(__file__), "./net.safetensors"), os.path.join(os.path.dirname(__file__), "./net_conv.safetensors"))
split_safetensor(os.path.join(os.path.dirname(__file__), "./net_conv.safetensors"))
os.remove(os.path.join(os.path.dirname(__file__), "net.safetensors"))
os.remove(os.path.join(os.path.dirname(__file__), "net_conv.safetensors"))
base_url = "."
prekernel = f"""
@@ -185,20 +200,6 @@ if __name__ == "__main__":
counter++;
}}
let allZero = true;
let out = safetensorParts[selectedPart].subarray(...correctedOffsets);
for (let i = 0; i < out.length; i++) {{
if (out[i] !== 0) {{
allZero = false;
break;
}}
}}
if (allZero) {{
console.log("Error: weight '" + key + "' is all zero.");
}}
return safetensorParts[selectedPart].subarray(...correctedOffsets);
}}
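
fixup_code above turns each generated kernel into a standalone WGSL module: the kernel name becomes main, the INFINITY uniform at binding(0) becomes a local inf() helper, and the remaining binding indices shift down by one. A quick sketch of the effect on a toy kernel (the shader text and the r_2_2 name are invented for illustration):

toy = """@group(0) @binding(0) var<uniform> INFINITY : f32;
@group(0) @binding(1) var<storage,read_write> data0: array<f32>;
@compute @workgroup_size(1) fn r_2_2(@builtin(global_invocation_id) gid: vec3<u32>) {
  data0[gid.x] = select(0.0, INFINITY, gid.x > 0u);
}"""
print(fixup_code(toy, "r_2_2"))
# the uniform declaration becomes "fn inf(a: f32) -> f32 { return a/0.0; }",
# INFINITY becomes inf(1.0), r_2_2 becomes main, and data0 moves to binding(0)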


@@ -1,64 +0,0 @@
const f16tof32 = `
fn u16_to_f16(x: u32) -> f32 {
let sign = f32((x >> 15) & 0x1);
let exponent = f32((x >> 10) & 0x1F);
let fraction = f32(x & 0x3FF);
let sign_multiplier = select(1.0, -1.0, sign == 1.0);
if (exponent == 0.0) {
return sign_multiplier * 6.103515625e-5 * (fraction / 1024.0);
} else {
return sign_multiplier * exp2(exponent - 15.0) * (1.0 + fraction / 1024.0);
}
}
@group(0) @binding(0) var<storage,read_write> data0: array<u32>;
@group(0) @binding(1) var<storage,read_write> data1: array<f32>;
@compute @workgroup_size(256) fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let gidx = gid.x;
let outgidx = gidx*2;
if (gidx >= arrayLength(&data0)) {
return;
}
let oo = data0[gidx];
let oo1 = (oo >> 16);
let oo2 = oo & 0xFFFFu;
let f1 = u16_to_f16(oo2);
let f2 = u16_to_f16(oo1);
data1[outgidx] = f1;
data1[outgidx + 1] = f2;
}`;
window.f16tof32GPU = async(device, inf16) => {
const input = device.createBuffer({size: inf16.length, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST });
const output = device.createBuffer({size: inf16.length*2, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST });
const gpuWriteBuffer = device.createBuffer({size: input.size, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE });
const gpuReadBuffer = device.createBuffer({ size: output.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ });
const commandEncoder = device.createCommandEncoder();
await gpuWriteBuffer.mapAsync(GPUMapMode.WRITE);
const alignedUint32View = new Uint32Array(inf16.buffer, inf16.byteOffset, inf16.length / 4);
new Uint32Array(gpuWriteBuffer.getMappedRange()).set(alignedUint32View);
gpuWriteBuffer.unmap();
commandEncoder.copyBufferToBuffer(gpuWriteBuffer, 0, input, 0, gpuWriteBuffer.size);
const pipeline = await device.createComputePipelineAsync({layout: "auto", compute: { module: device.createShaderModule({ code: f16tof32 }), entryPoint: "main" }});
addComputePass(device, commandEncoder, pipeline, [input, output], [Math.ceil(inf16.length/(4*256)), 1, 1]);
commandEncoder.copyBufferToBuffer(output, 0, gpuReadBuffer, 0, output.size);
const gpuCommands = commandEncoder.finish();
device.queue.submit([gpuCommands]);
await gpuReadBuffer.mapAsync(GPUMapMode.READ);
const resultBuffer = new Float32Array(gpuReadBuffer.size/4);
resultBuffer.set(new Float32Array(gpuReadBuffer.getMappedRange()));
gpuReadBuffer.unmap();
return resultBuffer;
}
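
The deleted shader above unpacks two IEEE 754 half floats from each u32 (the demo now loads the same kernel from the f16-to-f32-gpu package instead). A CPU-side sketch of the same decode for cross-checking, assuming numpy is available; like the shader, it ignores the exponent-31 inf/NaN case:

import struct
import numpy as np

def u16_to_f32(x: int) -> float:
  # mirrors u16_to_f16 above: sign in bit 15, 5 exponent bits, 10 fraction bits
  sign = -1.0 if (x >> 15) & 0x1 else 1.0
  exponent, fraction = (x >> 10) & 0x1F, x & 0x3FF
  if exponent == 0: return sign * 6.103515625e-5 * (fraction / 1024.0)  # subnormal, 2**-14 scale
  return sign * 2.0 ** (exponent - 15) * (1.0 + fraction / 1024.0)

for bits in (0x3C00, 0xC000, 0x0001, 0x7BFF):  # 1.0, -2.0, min subnormal, 65504.0
  assert np.isclose(u16_to_f32(bits), np.frombuffer(struct.pack("<H", bits), dtype=np.float16)[0])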


@@ -5,103 +5,213 @@
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>tinygrad has WebGPU</title>
<style>
body {
font-family: 'Arial', sans-serif;
text-align: center;
padding: 30px;
/* General Reset */
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
a {
text-decoration: none;
color: #4A90E2;
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background: #f4f7fb;
color: #333;
display: flex;
justify-content: center;
align-items: center;
min-height: 100vh;
padding: 20px;
flex-direction: column;
text-align: center;
}
h1 {
font-size: 36px;
font-weight: normal;
font-size: 2.2rem;
margin-bottom: 20px;
color: #4A90E2;
}
#wgpuError {
color: red;
font-size: 1.2rem;
margin-top: 20px;
display: none;
}
#sdTitle {
font-size: 1.5rem;
margin-bottom: 30px;
}
#mybox {
display: flex;
flex-direction: column;
align-items: center;
gap: 20px;
width: 50%;
margin: 0 auto;
background: #ffffff;
padding: 15px;
border-radius: 10px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
width: 120%;
max-width: 550px;
margin-bottom: 10px;
}
#promptText, #stepRange, #btnRunNet, #guidanceRange {
font-size: 18px;
input[type="text"] {
width: 100%;
padding: 12px;
margin-bottom: 20px;
font-size: 1rem;
border: 1px solid #ccc;
border-radius: 8px;
outline: none;
transition: all 0.3s ease;
}
#result {
font-size: 48px;
}
#time {
font-size: 16px;
color: grey;
}
canvas {
margin-top: 20px;
border: 1px solid #000;
input[type="text"]:focus {
border-color: #4A90E2;
}
label {
display: flex;
justify-content: space-between;
font-size: 1rem;
margin-bottom: 15px;
align-items: center;
gap: 10px;
width: 100%;
}
#sliderValue {
margin-right: 10px;
input[type="range"] {
width: 100%;
margin-left: 10px;
-webkit-appearance: none;
appearance: none;
height: 8px;
border-radius: 4px;
background: #ddd;
outline: none;
transition: background 0.3s ease;
}
input[type="range"]:focus {
background: #4A90E2;
}
#stepRange,
#guidanceRange {
width: 80%;
}
span {
font-size: 1.1rem;
font-weight: 600;
color: #333;
}
input[type="button"] {
padding: 12px 25px;
background-color: #4A90E2;
color: #fff;
font-size: 1.2rem;
border: none;
border-radius: 8px;
cursor: pointer;
transition: background-color 0.3s ease;
width: 100%;
margin-bottom: 20px;
}
input[type="button"]:disabled {
background-color: #ccc;
cursor: not-allowed;
}
input[type="button"]:hover {
background-color: #357ABD;
}
#divModelDl, #divStepProgress {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 20px;
}
#modelDlProgressBar,
#progressBar {
width: 80%;
height: 12px;
border-radius: 6px;
background-color: #e0e0e0;
}
#modelDlProgressBar::-webkit-progress-bar,
#progressBar::-webkit-progress-bar {
border-radius: 6px;
}
#modelDlProgressValue, #progressFraction {
font-size: 1rem;
font-weight: 600;
color: #333;
}
canvas {
max-width: 100%;
max-height: 450px;
margin-top: 10px;
border-radius: 8px;
border: 1px solid #ddd;
}
</style>
<script type="module">
import ClipTokenizer from 'https://softwired.nyc3.cdn.digitaloceanspaces.com/sd/clip_tokenizer.js';
import ClipTokenizer from './clip_tokenizer.js';
window.clipTokenizer = new ClipTokenizer();
</script>
<script src="./f16_to_f32.js"></script>
<script type="module">
import { f16tof32GPU } from 'https://unpkg.com/f16-to-f32-gpu@0.1.0/src/index.js';
window.f16tof32GPU = f16tof32GPU;
</script>
<script src="./net.js"></script>
</head>
<body>
<h1 id="wgpuError" style="display: none; color: red;">WebGPU is not supported in this browser</h1>
<h1 id="sdTitle">StableDiffusion by <a href="https://github.com/tinygrad/tinygrad" target="_blank">tinygrad</a> WebGPU</h1>
<div id="mybox">
<input id="promptText" type="text" placeholder="Enter your prompt here" value="a horse sized cat eating a bagel">
<h1 id="wgpuError" style="display: none;">WebGPU is not supported in this browser</h1>
<h1 id="sdTitle">StableDiffusion powered by <a href="https://github.com/tinygrad/tinygrad" target="_blank" style="color: #4A90E2;">tinygrad</a></h1>
<a href="https://github.com/tinygrad/tinygrad" target="_blank" style="position: absolute; top: 20px; right: 20px;">
<img src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg"
alt="GitHub Logo"
style="width: 32px; height: 32px;">
</a>
<label>
Steps: <span id="stepValue">8</span>
<input id="stepRange" type="range" min="5" max="20" value="8" step="1">
</label>
<div id="mybox">
<input id="promptText" type="text" placeholder="Enter your prompt here" value="a human standing on the surface of mars">
<label>
Guidance: <span id="guidanceValue">7.5</span>
<input id="guidanceRange" type="range" min="3" max="15" value="7.5" step="0.1">
</label>
<input id="btnRunNet" type="button" value="Run" disabled>
<label>
Steps: <span id="stepValue">9</span>
<input id="stepRange" type="range" min="5" max="20" value="9" step="1">
</label>
<div id="divModelDl" style="display: flex; align-items: center; width: 100%; gap: 10px;">
<span id="modelDlTitle">Downloading model</span>
<progress id="modelDlProgressBar" value="0" max="100" style="flex-grow: 1;"></progress>
<span id="modelDlProgressValue"></span>
<label>
Guidance: <span id="guidanceValue">8.0</span>
<input id="guidanceRange" type="range" min="3" max="15" value="8.0" step="0.1">
</label>
<input id="btnRunNet" type="button" value="Run" disabled>
<div id="divModelDl" style="display: flex; align-items: center; width: 100%; gap: 10px;">
<span id="modelDlTitle">Downloading model</span>
<progress id="modelDlProgressBar" value="0" max="100" style="flex-grow: 1;"></progress>
<span id="modelDlProgressValue"></span>
</div>
<div id="divStepProgress" style="display: none; align-items: center; width: 100%; gap: 10px;">
<progress id="progressBar" value="0" max="100" style="flex-grow: 1;"></progress>
<span id="progressFraction"></span>
</div>
<div id="divStepTime" style="display: none; align-items: center; width: 100%; gap: 10px;">
<span id="stepTimeValue">0 ms</span>
</div>
</div>
<div id="divStepProgress" style="display: none; align-items: center; width: 100%; gap: 10px;">
<progress id="progressBar" value="0" max="100" style="flex-grow: 1;"></progress>
<span id="progressFraction"></span>
</div>
</div>
<canvas id="canvas" width="512" height="512"></canvas>
<canvas id="canvas" width="512" height="512"></canvas>
<script>
function initDb() {
@@ -129,30 +239,36 @@
}
function saveTensorToDb(db, id, tensor) {
return new Promise((resolve, reject) => {
if (db == null) {
resolve(null);
return readTensorFromDb(db, id).then((result) => {
if (!result) {
new Promise((resolve, reject) => {
if (db == null) {
resolve(null);
}
const transaction = db.transaction(['tensors'], 'readwrite');
const store = transaction.objectStore('tensors');
const request = store.put({ id: id, content: tensor });
transaction.onabort = (event) => {
console.log("Transaction error while saving tensor: " + event.target.error);
resolve(null);
};
request.onsuccess = () => {
console.log('Tensor saved successfully.');
resolve();
};
request.onerror = (event) => {
console.error('Tensor save failed:', event.target.error);
resolve(null);
};
});
} else {
return null;
}
const transaction = db.transaction(['tensors'], 'readwrite');
const store = transaction.objectStore('tensors');
const request = store.put({ id: id, content: tensor });
transaction.onabort = (event) => {
console.log("Transaction error while saving tensor: " + event.target.error);
resolve(null);
};
request.onsuccess = () => {
console.log('Tensor saved successfully.');
resolve();
};
request.onerror = (event) => {
console.error('Tensor save failed:', event.target.error);
resolve(null);
};
});
}).catch(()=> null);
}
function readTensorFromDb(db, id) {
@@ -173,10 +289,8 @@
request.onsuccess = (event) => {
const result = event.target.result;
if (result) {
console.log("Cache hit: " + id);
resolve(result);
} else {
console.log("Cache miss: " + id);
resolve(null);
}
};
@@ -190,7 +304,7 @@
window.addEventListener('load', async function() {
if (!navigator.gpu) {
document.getElementById("wgpuError").style.display = "";
document.getElementById("wgpuError").style.display = "block";
document.getElementById("sdTitle").style.display = "none";
return;
}
@@ -247,6 +361,18 @@
let totalSize = 0;
let partSize = {};
const getPart = async(key) => {
let part = await readTensorFromDb(db, key);
if (part) {
console.log(`Cache hit: ${key}`);
return Promise.resolve(part.content);
} else {
console.log(`Cache miss: ${key}`);
return getProgressDlForPart(`${window.MODEL_BASE_URL}/${key}.safetensors`, progressCallback);
}
}
const progressCallback = (part, loaded, total) => {
totalLoaded += loaded;
@@ -258,63 +384,31 @@
progress(totalLoaded, totalSize);
};
let combinedBuffer = await readTensorFromDb(db, "net.f16");
let textModelU8 = await readTensorFromDb(db, "net.text");
let textModelFetched = false;
let netKeys = ["net_part0", "net_part1", "net_part2", "net_part3", "net_textmodel"];
let buffers = await Promise.all(netKeys.map(key => getPart(key)));
if (combinedBuffer == null) {
let dlParts = [
getProgressDlForPart(window.MODEL_BASE_URL + '/net_part0.safetensors', progressCallback),
getProgressDlForPart(window.MODEL_BASE_URL + '/net_part1.safetensors', progressCallback),
getProgressDlForPart(window.MODEL_BASE_URL + '/net_part2.safetensors', progressCallback),
getProgressDlForPart(window.MODEL_BASE_URL + '/net_part3.safetensors', progressCallback)
];
if (textModelU8 == null) {
dlParts.push(getProgressDlForPart(window.MODEL_BASE_URL + '/net_textmodel.safetensors', progressCallback));
// Combine everything except for text model, since that's already f32
const totalLength = buffers.reduce((acc, buffer, index, array) => {
if (index < 4) {
return acc + buffer.byteLength;
} else {
return acc;
}
}, 0
);
let buffers = await Promise.all(dlParts);
// Combine everything except for text model, since that's alreafy f32
const totalLength = buffers.reduce((acc, buffer, index, array) => {
if (index < 4) {
return acc + buffer.byteLength;
} else {
return acc;
}
}, 0
);
combinedBuffer = new Uint8Array(totalLength);
let offset = 0;
buffers.forEach((buffer, index) => {
if (index < 4) {
combinedBuffer.set(new Uint8Array(buffer), offset);
offset += buffer.byteLength;
buffer = null;
}
});
await saveTensorToDb(db, "net.f16", combinedBuffer);
if (textModelU8 == null) {
textModelFetched = true;
textModelU8 = new Uint8Array(buffers[4]);
await saveTensorToDb(db, "net.text", textModelU8);
combinedBuffer = new Uint8Array(totalLength);
let offset = 0;
buffers.forEach((buffer, index) => {
saveTensorToDb(db, netKeys[index], new Uint8Array(buffer));
if (index < 4) {
combinedBuffer.set(new Uint8Array(buffer), offset);
offset += buffer.byteLength;
buffer = null;
}
} else {
combinedBuffer = combinedBuffer.content;
}
if (textModelU8 == null) {
textModelU8 = new Uint8Array(await getProgressDlForPart(window.MODEL_BASE_URL + '/net_textmodel.safetensors', progressCallback));
await saveTensorToDb(db, "net.text", textModelU8);
} else if (!textModelFetched) {
textModelU8 = textModelU8.content;
}
});
let textModelU8 = new Uint8Array(buffers[4]);
document.getElementById("modelDlTitle").innerHTML = "Decompressing model";
const textModelOffset = 3772703308;
@@ -346,16 +440,7 @@
let chunkStartF16 = 8 + metadataLength + (decodeChunkSize * i);
let chunkEndF16 = chunkStartF16 + decodeChunkSize;
let chunk = combinedBuffer.subarray(chunkStartF16, chunkEndF16);
if (chunk.byteLength %4 != 0) {
const paddingBytes = 4 - (chunk.byteLength % 4);
const alignedBuffer = new ArrayBuffer(chunk.byteLength + paddingBytes);
const alignedView = new Uint8Array(alignedBuffer);
alignedView.set(new Uint8Array(chunk));
chunk = alignedView;
}
let result = await f16tof32GPU(device, chunk);
let result = await f16tof32GPU(chunk);
let resultUint8 = new Uint8Array(result.buffer);
let chunkStartF32 = 8 + metadataLength + (decodeChunkSize * i * 2);
let chunkEndF32 = chunkStartF32 + resultUint8.byteLength;
@@ -421,7 +506,7 @@
document.getElementById("btnRunNet").disabled = false;
}
function runStableDiffusion(prompt, steps, guidance) {
function runStableDiffusion(prompt, steps, guidance, showStep) {
return new Promise(async (resolve, reject) => {
let context = await timer(() => nets["textModel"](clipTokenizer.encodeForCLIP(prompt)));
let unconditional_context = await timer(() => nets["textModel"](clipTokenizer.encodeForCLIP("")));
@@ -456,8 +541,16 @@
for (let i = timesteps.length - 1; i >= 0; i--) {
let timestep = new Float32Array([timesteps[i]]);
let x_prev = await timer(() => nets["diffusor"](unconditional_context, context, latent, timestep, new Float32Array([alphas[i]]), new Float32Array([alphas_prev[i]]), new Float32Array([guidance])));
let start = performance.now()
let x_prev = await nets["diffusor"](unconditional_context, context, latent, timestep, new Float32Array([alphas[i]]), new Float32Array([alphas_prev[i]]), new Float32Array([guidance]));
document.getElementById("divStepTime").style.display = "block";
document.getElementById("stepTimeValue").innerText = `${(performance.now() - start).toFixed(1)} ms / step`
latent = x_prev;
if (showStep != null) {
showStep(await nets["decoder"](latent));
}
document.getElementById("progressBar").value = ((steps - i) / steps) * 100
document.getElementById("progressFraction").innerHTML = (steps - i) + "/" + steps
}
@@ -466,27 +559,37 @@
});
}
function renderImage(e, image) {
let pixels = []
let pixelCounter = 0
for (let j = 0; j < 512; j++) {
for (let k = 0; k < 512; k++) {
pixels.push(image[pixelCounter])
pixels.push(image[pixelCounter+1])
pixels.push(image[pixelCounter+2])
pixels.push(255)
pixelCounter += 3
}
}
ctx.putImageData(new ImageData(new Uint8ClampedArray(pixels), 512, 512), 0, 0);
e.target.disabled = false;
}
document.getElementById("btnRunNet").addEventListener("click", function(e) {
e.target.disabled = true;
const canvas = document.getElementById("canvas");
ctx.clearRect(0, 0, canvas.width, canvas.height);
runStableDiffusion(document.getElementById("promptText").value, document.getElementById("stepRange").value, document.getElementById("guidanceRange").value).then((image) => {
let pixels = []
let pixelCounter = 0
for (let j = 0; j < 512; j++) {
for (let k = 0; k < 512; k++) {
pixels.push(image[pixelCounter])
pixels.push(image[pixelCounter+1])
pixels.push(image[pixelCounter+2])
pixels.push(255)
pixelCounter += 3
}
}
ctx.putImageData(new ImageData(new Uint8ClampedArray(pixels), 512, 512), 0, 0);
console.log(image);
console.log("Success");
e.target.disabled = false;
runStableDiffusion(
document.getElementById("promptText").value,
document.getElementById("stepRange").value,
document.getElementById("guidanceRange").value,
// Decode at each step
null
).then((image) => {
renderImage(e, image);
});
}, false);


@@ -1,39 +1,3 @@
# TODO: how much of this can be merged with above?
class WGSLLanguage(CStyleLanguage):
code_for_workitem = {"g": lambda x: f"i32(gindex.{'xyz'[x]})", "l": lambda x: f"i32(lindex.{'xyz'[x]})"}
size_prefix = "let"
barrier="workgroupBarrier();"
generic_var_prefix = "var "
external_local_bufs = True
code_for_op = { **CStyleLanguage().code_for_op,
BinaryOps.CMPLT: lambda x,y,dtype: f"f32({x}<{y})", BinaryOps.CMPEQ: lambda x,y,dtype: f"f32({x}=={y})",
TernaryOps.MULACC: lambda x,y,z,dtype: f"fma({x},{y},{z})", TernaryOps.WHERE: lambda a,b,c,dtype: f"select({c},{b},bool({a}))" }
# HACK: write bool as f32
type_map = {dtypes.float: "f32", dtypes.half: "f16", dtypes.int32: "i32", dtypes.uint32: "u32", dtypes.bool: "f32"}
def render_local(self, name: str, dtype:DType, size: int): return f"var<workgroup> {name}: array<{self.type_map[dtype]},{size}>;"
def render_const(self, x:Union[float,int], var_dtype) -> str:
if math.isnan(x): return "nan()"
elif math.isinf(x): return ("-" if x < 0 else "") + "inf(1.0)"
return f"({super().render_const(x, var_dtype)})"
def render_if(self, cond: str): return f"if (bool({cond})) {{"
def render_kernel(self, function_name:str, kernel:List[str], bufs:List[Tuple[str,DType]], local_size:List[int], prekernel:List[str]) -> str:
local_size = local_size[::-1] if local_size else [1]
bind_it = iter(range(len(bufs)))
prg = "fn nan() -> f32 { let bits = 0xffffffffu; return bitcast<f32>(bits); }\nfn inf(a: f32) -> f32 { return a/0.0; }\n"
prg += "\n".join(prekernel+[f"@group(0) @binding({next(bind_it)}) {'var<storage,read_write>' if isinstance(dtype, PtrDType) else 'var<uniform>'} {name}: {f'array<{self.type_map[dtype]}>' if isinstance(dtype, PtrDType) else 'i32'};" for name,dtype in bufs]) # noqa: E501
prg += f"\n@compute @workgroup_size({','.join([str(x) for x in local_size])}) fn {function_name}(@builtin(workgroup_id) gindex: vec3<u32>, @builtin(local_invocation_id) lindex: vec3<u32>) {{\n" + "\n".join(kernel) + "\n}" # noqa: E501
return prg
def render_cast(self, x:List[str], var_dtype:DType, bitcast=False) -> str:
if self.type_map[var_dtype]: return f"bitcast<{self.type_map[var_dtype]}>({x[0]})" if bitcast else f"{self.type_map[var_dtype]}({x[0]})"
raise NotImplementedError(f"no cast for {var_dtype}")
WGSLRenderer = functools.partial(uops_to_cstyle, WGSLLanguage())
class GLSLLanguage(CStyleLanguage):
type_map = {dtypes.float: "float", dtypes.half: "float", dtypes.int32: "int", dtypes.uint32: "uint", dtypes.bool: "bool"}
sampler_prefix = {dtypes.float64: "d", dtypes.float: "", dtypes.half: "", dtypes.int32: "i", dtypes.uint32: "u", dtypes.bool: "i"}


@@ -1,40 +0,0 @@
from wgpu.utils.device import get_default_device
from tinygrad.device import Compiled, Allocator, CompilerOptions
from tinygrad.renderer.cstyle import WGSLRenderer
import wgpu
wgpu_device = get_default_device()
def create_uniform(val: int) -> wgpu.GPUBuffer:
buf = wgpu_device.create_buffer(size=4, usage=wgpu.BufferUsage.UNIFORM | wgpu.BufferUsage.COPY_DST)
wgpu_device.queue.write_buffer(buf, 0, val.to_bytes(4, "little"))
return buf
class WebGPUProgram:
def __init__(self, name:str, lib:bytes):
self.name, self.lib, self.prg = name, lib, wgpu_device.create_shader_module(code=lib) # NOTE: this is the compiler
def __call__(self, *bufs, global_size, local_size, vals=(), wait=False):
assert len(bufs) <= 8, "WEBGPU only supports 8 buffers"
binding_layouts = [{"binding": i, "visibility": wgpu.ShaderStage.COMPUTE, "buffer": {"type": wgpu.BufferBindingType.uniform if i >= len(bufs) else wgpu.BufferBindingType.storage }} for i in range(len(bufs)+len(vals))] # noqa: E501
bindings = [{"binding": i, "resource": {"buffer": create_uniform(x) if i >= len(bufs) else x, "offset": 0, "size": 4 if i >= len(bufs) else x.size}} for i,x in enumerate(bufs+vals)] # noqa: E501
bind_group_layout = wgpu_device.create_bind_group_layout(entries=binding_layouts)
pipeline_layout = wgpu_device.create_pipeline_layout(bind_group_layouts=[bind_group_layout])
bind_group = wgpu_device.create_bind_group(layout=bind_group_layout, entries=bindings)
compute_pipeline = wgpu_device.create_compute_pipeline(layout=pipeline_layout,compute={"module": self.prg, "entry_point": self.name},)
command_encoder = wgpu_device.create_command_encoder()
compute_pass = command_encoder.begin_compute_pass()
compute_pass.set_pipeline(compute_pipeline)
compute_pass.set_bind_group(0, bind_group, [], 0, 999999) # last 2 not used
compute_pass.dispatch_workgroups(*global_size) # x y z
compute_pass.end()
wgpu_device.queue.submit([command_encoder.finish()])
class WebGpuAllocator(Allocator):
def _alloc(self, size: int):
return wgpu_device.create_buffer(size=size, usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_DST | wgpu.BufferUsage.COPY_SRC)
def _copyin(self, dest, src: memoryview): wgpu_device.queue.write_buffer(dest, 0, src)
def _copyout(self, dest, src: memoryview): dest[:] = wgpu_device.queue.read_buffer(src, 0) # TODO: remove this copy
class WebGpuDevice(Compiled):
def __init__(self, device:str):
super().__init__(WebGpuAllocator(), CompilerOptions(device="WEBGPU", supports_float4=False, local_max=[256, 256, 64],
global_max=[65535, 65535, 65535]), WGSLRenderer, lambda x: x, WebGPUProgram)
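
For readers new to wgpu-py, a minimal standalone sketch of the dispatch flow the deleted WebGPUProgram wraps, reusing the calls and signatures above (assumes a wgpu-py with the v0.19-era API); it doubles four floats on the GPU:

import struct
import wgpu
from wgpu.utils.device import get_default_device

dev = get_default_device()
shader = """
@group(0) @binding(0) var<storage,read_write> data: array<f32>;
@compute @workgroup_size(4) fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  data[gid.x] = data[gid.x] * 2.0;
}"""
buf = dev.create_buffer(size=16, usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_DST | wgpu.BufferUsage.COPY_SRC)
dev.queue.write_buffer(buf, 0, struct.pack("<4f", 1.0, 2.0, 3.0, 4.0))
bgl = dev.create_bind_group_layout(entries=[
  {"binding": 0, "visibility": wgpu.ShaderStage.COMPUTE, "buffer": {"type": wgpu.BufferBindingType.storage}}])
bind_group = dev.create_bind_group(layout=bgl, entries=[
  {"binding": 0, "resource": {"buffer": buf, "offset": 0, "size": buf.size}}])
pipeline = dev.create_compute_pipeline(layout=dev.create_pipeline_layout(bind_group_layouts=[bgl]),
  compute={"module": dev.create_shader_module(code=shader), "entry_point": "main"})
encoder = dev.create_command_encoder()
compute_pass = encoder.begin_compute_pass()
compute_pass.set_pipeline(pipeline)
compute_pass.set_bind_group(0, bind_group, [], 0, 999999)  # last 2 not used
compute_pass.dispatch_workgroups(1, 1, 1)  # x y z
compute_pass.end()
dev.queue.submit([encoder.finish()])
print(struct.unpack("<4f", dev.queue.read_buffer(buf, 0)))  # -> (2.0, 4.0, 6.0, 8.0)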


@@ -249,7 +249,14 @@ def export_model_webgl(functions, statements, bufs, bufs_to_save, weight_names,
def export_model_webgpu(functions, statements, bufs, bufs_to_save, weight_names, input_names, output_names) -> Tuple[str,int,int]:
kernel_code = '\n\n'.join([f"const {key} = `{code.replace(key, 'main')}`;" for key, code in functions.items()])
kernel_names = ', '.join([name for (name, _args, _global_size, _local_size) in statements])
kernel_calls = '\n '.join([f"addComputePass(device, commandEncoder, piplines[{i}], [{', '.join(args)}], {global_size});" for i, (_name, args, global_size, _local_size) in enumerate(statements) ])
create_bind_group_layouts = ",".join([
"device.createBindGroupLayout({{entries: [{{binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: {{ type: 'uniform' }}}}, {}]}})".format(
",".join([f"{{binding: {argIdx+1}, visibility: GPUShaderStage.COMPUTE, buffer: {{ type: 'storage' }} }}" for argIdx, _ in enumerate(args)])
)
for i, (_name, args, global_size, _local_size) in enumerate(statements)
])
layouts = f"const layouts=[{create_bind_group_layouts}]"
kernel_calls = '\n '.join([f"addComputePass(device, commandEncoder, pipelines[{i}], layouts[{i}], infinityBuf, [{', '.join(args)}], {global_size});" for i, (_name, args, global_size, _local_size) in enumerate(statements) ])
_bufs = '\n '.join([f"const {name} = " + (f"createEmptyBuf(device, {size});" if _key not in weight_names else f"createWeightBuf(device, {size}, getTensorBuffer(safetensor, metadata['{weight_names[_key]}']))") + ";" for name,(size,dtype,_key) in bufs.items()])
gpu_write_bufs = '\n '.join([f"const gpuWriteBuffer{i} = device.createBuffer({{size:{input_name}.size, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE }});" for i,input_name in enumerate(input_names)])
input_writers = '\n '.join([f"await gpuWriteBuffer{i}.mapAsync(GPUMapMode.WRITE);\n new Float32Array(gpuWriteBuffer{i}.getMappedRange()).set(" + f'_{inp_name});' + f"\n gpuWriteBuffer{i}.unmap();\n commandEncoder.copyBufferToBuffer(gpuWriteBuffer{i}, 0, {inp_name}, 0, gpuWriteBuffer{i}.size);" for i,inp_name in enumerate(input_names)])
@@ -266,6 +273,18 @@ const createEmptyBuf = (device, size) => {{
return device.createBuffer({{size, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST }});
}};
const createInfinityUniformBuf = (device) => {{
const size = 4;
const buf = device.createBuffer({{
mappedAtCreation: true,
size,
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST
}});
new Float32Array(buf.getMappedRange())[0] = Infinity;
buf.unmap();
return buf;
}};
const createWeightBuf = (device, size, data) => {{
const buf = device.createBuffer({{ mappedAtCreation: true, size, usage: GPUBufferUsage.STORAGE }});
new Uint8Array(buf.getMappedRange()).set(data);
@@ -273,8 +292,15 @@ const createWeightBuf = (device, size, data) => {{
return buf;
}};
const addComputePass = (device, commandEncoder, pipeline, bufs, workgroup) => {{
const bindGroup = device.createBindGroup({{layout: pipeline.getBindGroupLayout(0), entries: bufs.map((buffer, index) => ({{ binding: index, resource: {{ buffer }} }}))}});
const addComputePass = (device, commandEncoder, pipeline, layout, infinityUniformBuf, bufs, workgroup) => {{
const bindGroup = device.createBindGroup({{
layout: layout,
entries: [
{{ binding: 0, resource: {{ buffer: infinityUniformBuf }} }},
...bufs.map((buffer, index) => ({{ binding: index + 1, resource: {{ buffer }} }}))
]
}});
const passEncoder = commandEncoder.beginComputePass();
passEncoder.setPipeline(pipeline);
passEncoder.setBindGroup(0, bindGroup);
@@ -286,6 +312,9 @@ const addComputePass = (device, commandEncoder, pipeline, bufs, workgroup) => {{
const setupNet = async (device, safetensor) => {{
const metadata = getTensorMetadata(safetensor);
const infinityBuf = createInfinityUniformBuf(device);
{layouts}
{_bufs}
@@ -294,7 +323,19 @@ const setupNet = async (device, safetensor) => {{
{gpu_read_bufs}
const kernels = [{kernel_names}];
const piplines = await Promise.all(kernels.map(name => device.createComputePipelineAsync({{layout: "auto", compute: {{ module: device.createShaderModule({{ code: name }}), entryPoint: "main" }}}})));
const pipelines = await Promise.all(kernels.map(async (name, i) => {{
return await device.createComputePipelineAsync({{
layout: device.createPipelineLayout({{
bindGroupLayouts: [layouts[i]],
}}),
compute: {{
module: device.createShaderModule({{
code: name,
}}),
entryPoint: "main",
}},
}});
}}))
return async ({",".join([f"_{input_name}" for input_name in input_names])}) => {{
const commandEncoder = device.createCommandEncoder();
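
createInfinityUniformBuf exists because WGSL rejects infinity and NaN float literals, so every exported kernel reads INFINITY from a 4-byte uniform at binding(0); that is also why addComputePass now takes an explicit layout and infinityUniformBuf. A hypothetical Python counterpart, built from the same wgpu-py calls the deleted create_uniform used:

import struct
import wgpu

def create_infinity_uniform(dev):
  # 4-byte uniform holding f32 +inf, bound at binding(0) of every kernel
  buf = dev.create_buffer(size=4, usage=wgpu.BufferUsage.UNIFORM | wgpu.BufferUsage.COPY_DST)
  dev.queue.write_buffer(buf, 0, struct.pack("<f", float("inf")))
  return buf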


@@ -59,6 +59,7 @@ setup(name='tinygrad',
"bottle",
"ggml-python"
],
'webgpu': ["wgpu>=v0.19.0"],
'docs': [
"mkdocs",
"mkdocs-material",


@@ -1,6 +1,6 @@
import unittest, contextlib
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes, nn
from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device
from tinygrad.helpers import CI, Context, getenv
from tinygrad.engine.realize import run_schedule
from tinygrad.codegen.kernel import Opt, OptOps, Kernel, KernelOptError
@@ -139,7 +139,7 @@ class TestIndexing(unittest.TestCase):
np.testing.assert_equal(X.numpy(), 0)
@unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
def test_index_mnist(self, noopt=1, op_limit=512*784*5):
def test_index_mnist(self, noopt=1, op_limit=512*784*10):
from tinygrad.nn.datasets import mnist
X_train, Y_train, _, _ = mnist()
with Context(NOOPT=noopt, FUSE_ARANGE=1, SPLIT_REDUCEOP=0):
@@ -153,7 +153,7 @@ class TestIndexing(unittest.TestCase):
@unittest.skip("not ready")
def test_index_mnist_opt(self): self.test_index_mnist(0)
@unittest.skipIf(getenv("PTX"), "broken on ptx for some reason")
@unittest.skipIf(getenv("PTX") or Device.DEFAULT == "WEBGPU", "broken on ptx and WebGPU for some reason")
def test_llama_embedding(self, noopt=1, op_limit=65536):
# llama3 is 128256
vocab_size, embed_size = (10, 3) if CI else (32000, 4096)


@@ -35,11 +35,11 @@ def _test_to_np(a:Tensor, np_dtype, target):
except AssertionError as e:
raise AssertionError(f"\ntensor {a.numpy()} does not match target {target} with np_dtype {np_dtype}") from e
def _assert_eq(tensor:Tensor, target_dtype:DType, target):
def _assert_eq(tensor:Tensor, target_dtype:DType, target, tol_target_dtype:float=1e-7):
if DEBUG >= 2: print(tensor.numpy())
try:
assert tensor.dtype == target_dtype
np.testing.assert_allclose(tensor.numpy(), target, rtol={dtypes.float16:1e-3, dtypes.bfloat16:1e-2}.get(target_dtype, 1e-7))
np.testing.assert_allclose(tensor.numpy(), target, rtol={dtypes.float16:1e-3, dtypes.bfloat16:1e-2}.get(target_dtype, tol_target_dtype))
except AssertionError as e:
raise AssertionError(f"\ntensor {tensor.numpy()} dtype {tensor.dtype} does not match target {target} with dtype {target_dtype}") from e
@@ -541,7 +541,7 @@ class TestTypeSpec(unittest.TestCase):
_assert_eq(Tensor.arange(5, dtype=dtypes.int64), dtypes.int64, np.arange(5))
if is_dtype_supported(dtypes.float16):
_assert_eq(Tensor.arange(5, dtype=dtypes.float16), dtypes.float16, np.arange(5))
_assert_eq(Tensor.arange(3, 9, 0.7), dtypes.default_float, np.arange(3, 9, 0.7))
_assert_eq(Tensor.arange(3, 9, 0.7), dtypes.default_float, np.arange(3, 9, 0.7), 1e-6 if Device.DEFAULT == "WEBGPU" else 1e-7)
_assert_eq(Tensor.arange(3, 8.5, 3), dtypes.default_float, np.arange(3, 8.5, 3))
# stop-start and step have different signs
_assert_eq(Tensor.arange(3, 5, -2), dtypes.default_int, np.arange(3, 5, -2))


@@ -11,7 +11,7 @@ from tinygrad.engine.realize import run_schedule
from tinygrad.ops import GroupOp
from tinygrad.tensor import _to_np_dtype
from tinygrad.device import is_dtype_supported
import pytest
import pytest, math
pytestmark = pytest.mark.filterwarnings("ignore")
settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
@@ -41,8 +41,8 @@ unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), (Tensor.sin, np.
# TODO: (a+b)/2 in tensor.py's maximum can overflow. This requires a new implementation of maximum that can be backpropagated
#binary_operations += [(Tensor.maximum, np.maximum)]
# TODO: CI CUDA segfaults on sin
if getenv("MOCKGPU") and Device.DEFAULT == "NV": unary_operations.remove((Tensor.sin, np.sin))
# TODO: CI CUDA segfaults on sin, WEBGPU sin is not precise enough for large numbers
if (getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU": unary_operations.remove((Tensor.sin, np.sin))
class ht:
float64 = strat.floats(width=64, allow_subnormal=False)
@@ -88,6 +88,8 @@ def universal_test_cast(a, in_dtype, dtype):
np.testing.assert_equal(tensor_value.numpy(), numpy_value)
def universal_test_midcast(a, b, c, op1, op2, d1:DType, d2:DType):
# the 'inf' and 'nan' cases are wrong on WEBGPU
if (c in [math.inf, -math.inf] or math.isnan(c)) and Device.DEFAULT == "WEBGPU": return
if not isinstance(op1, tuple): op1 = (op1, op1)
if not isinstance(op2, tuple): op2 = (op2, op2)
at, bt, ct = Tensor([a], dtype=d1), Tensor([b], dtype=d1), Tensor([c], dtype=d2)
@@ -148,6 +150,7 @@ class TestDTypeALU(unittest.TestCase):
@given(ht.int32, ht.int32, strat.sampled_from(integer_binary_operations))
def test_int32(self, a, b, op): universal_test(a, b, dtypes.int32, op)
@unittest.skipUnless(is_dtype_supported(dtypes.int64, Device.DEFAULT), f"no int64 on {Device.DEFAULT}")
@given(ht.int64, ht.int64, strat.sampled_from(integer_binary_operations))
def test_int64(self, a, b, op): universal_test(a, b, dtypes.int64, op)


@@ -312,7 +312,8 @@ class TestOps(unittest.TestCase):
def _test_cmp(self, fxn, reverse=True):
# test different dtypes
helper_test_op(None, fxn, fxn, forward_only=True, vals=[[0.,1,2], [2.,1,0]])
helper_test_op(None, fxn, fxn, forward_only=True, vals=[[0,1,2], [2,1,0]])
if is_dtype_supported(dtypes.long):
helper_test_op(None, fxn, fxn, forward_only=True, vals=[[0,1,2], [2,1,0]])
helper_test_op(None, fxn, fxn, forward_only=True, vals=[[True, True, False], [False,True,False]])
# test broadcasting
for shps in [[(3, 4, 5), (3, 4, 5)], [(3, 4, 5), (5,)], [(5,), (3, 4, 5)]]:
@@ -382,6 +383,7 @@ class TestOps(unittest.TestCase):
helper_test_op(None, torch.isinf, Tensor.isinf, vals=[val], forward_only=True)
np.testing.assert_equal(Tensor(val).isinf(detect_positive=True, detect_negative=False).numpy(), [False, False, True, False, False])
np.testing.assert_equal(Tensor(val).isinf(detect_positive=False, detect_negative=True).numpy(), [True, False, False, False, False])
def test_isnan(self):
helper_test_op(None, torch.isnan, Tensor.isnan, vals=[[float('-inf'), 0., float('inf'), float('nan'), 1.1]], forward_only=True)
@@ -499,8 +501,9 @@ class TestOps(unittest.TestCase):
helper_test_op(None, lambda x: x//2, forward_only=True, vals=np.array([[3, 4, 5]], dtype=np.int32))
torch_idiv, tiny_idiv = functools.partial(torch.div, rounding_mode="trunc"), Tensor.idiv
helper_test_op(None, torch_idiv, tiny_idiv, forward_only=True, vals=np.array([[5, -6, 7],[1, 2, 3]], dtype=np.int32))
x = Tensor(2**64 - 1, dtype=dtypes.uint64).idiv(1)
np.testing.assert_equal(x.numpy(), 2**64 - 1)
if is_dtype_supported(dtypes.uint64):
x = Tensor(2**64 - 1, dtype=dtypes.uint64).idiv(1)
np.testing.assert_equal(x.numpy(), 2**64 - 1)
def test_scalar_div(self):
helper_test_op([(45,65)], lambda x: x/255)
helper_test_op([(45,65)], lambda x: x/1)
@@ -525,6 +528,7 @@ class TestOps(unittest.TestCase):
def test_pow_full(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x**y)
helper_test_op([(45,65), (45,65)], lambda x,y: x.pow(y))
def test_pow(self):
helper_test_op([(45,65)], lambda x: x**0)
helper_test_op([(45,65)], lambda x: x**1)
@@ -644,14 +648,14 @@ class TestOps(unittest.TestCase):
helper_test_op([(45,65)], lambda x: x.sin())
helper_test_op([()], lambda x: x.sin())
# works on real CUDA but not CI
if not (getenv("MOCKGPU") and Device.DEFAULT == "NV"):
if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
helper_test_op(None, lambda x: x.sin(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
helper_test_op(None, lambda x: x.sin(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
def test_cos(self):
helper_test_op([(45,65)], lambda x: x.cos())
helper_test_op([()], lambda x: x.cos())
if not (getenv("MOCKGPU") and Device.DEFAULT == "NV"):
if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
helper_test_op(None, lambda x: x.sin(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
helper_test_op(None, lambda x: x.cos(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@@ -660,7 +664,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(45,65)], lambda x: x.tan(), low=-1.5, high=1.5)
helper_test_op([(45,65)], lambda x: x.tan(), low=-5, high=5, forward_only=True)
helper_test_op([()], lambda x: x.tan())
if not (getenv("MOCKGPU") and Device.DEFAULT == "NV"):
if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
helper_test_op(None, lambda x: x.sin(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
helper_test_op(None, lambda x: x.cos(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@@ -994,7 +998,8 @@ class TestOps(unittest.TestCase):
np.arange(64,128,dtype=np.float32).reshape(8,8)])
def test_small_gemm_eye(self):
helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
@unittest.skipIf(CI and Device.DEFAULT in ["NV", "LLVM", "GPU", "CUDA"] or IMAGE, "not supported on these in CI/IMAGE")
@unittest.skipIf(CI and Device.DEFAULT in ["NV", "LLVM", "GPU", "CUDA"] or IMAGE \
or Device.DEFAULT == "WEBGPU", "not supported on these in CI/IMAGE")
def test_gemm_fp16(self):
helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3)
def test_gemm(self):
@@ -1076,8 +1081,9 @@ class TestOps(unittest.TestCase):
helper_test_op([(45,3)], lambda x: x.min().mul(0.5))
helper_test_op([()], lambda x: x.min())
helper_test_op(None, lambda x: x.type(torch.int32).min(), lambda x: x.cast(dtypes.int32).min(), forward_only=True, vals=[[0, -2**31]])
helper_test_op(None, lambda x: x.type(torch.int32).min(), lambda x: x.cast(dtypes.int32).min(), forward_only=True, vals=[[-2**31, 0]])
if is_dtype_supported(dtypes.long):
helper_test_op(None, lambda x: x.type(torch.int32).min(), lambda x: x.cast(dtypes.int32).min(), forward_only=True, vals=[[0, -2**31]])
helper_test_op(None, lambda x: x.type(torch.int32).min(), lambda x: x.cast(dtypes.int32).min(), forward_only=True, vals=[[-2**31, 0]])
helper_test_op(None, lambda x: x.type(torch.bool).min(), lambda x: x.cast(dtypes.bool).min(), forward_only=True, vals=[[False, True]])
helper_test_op(None, lambda x: x.type(torch.bool).min(), lambda x: x.cast(dtypes.bool).min(), forward_only=True, vals=[[True, False]])
@@ -1088,8 +1094,9 @@ class TestOps(unittest.TestCase):
helper_test_op([(3,4,5,6)], lambda x: x.max(axis=1)[0], lambda x: x.max(axis=1))
helper_test_op([()], lambda x: x.max())
helper_test_op(None, lambda x: x.type(torch.int32).max(), lambda x: x.cast(dtypes.int32).max(), forward_only=True, vals=[[0, -2**31]])
helper_test_op(None, lambda x: x.type(torch.int32).max(), lambda x: x.cast(dtypes.int32).max(), forward_only=True, vals=[[-2**31, 0]])
if is_dtype_supported(dtypes.long):
helper_test_op(None, lambda x: x.type(torch.int32).max(), lambda x: x.cast(dtypes.int32).max(), forward_only=True, vals=[[0, -2**31]])
helper_test_op(None, lambda x: x.type(torch.int32).max(), lambda x: x.cast(dtypes.int32).max(), forward_only=True, vals=[[-2**31, 0]])
helper_test_op(None, lambda x: x.type(torch.bool).max(), lambda x: x.cast(dtypes.bool).max(), forward_only=True, vals=[[False, True]])
helper_test_op(None, lambda x: x.type(torch.bool).max(), lambda x: x.cast(dtypes.bool).max(), forward_only=True, vals=[[True, False]])
@@ -2343,16 +2350,19 @@ class TestOps(unittest.TestCase):
helper_test_op([(32,10), (32,10)], lambda x,y: torch.nn.functional.cross_entropy(x, y, label_smoothing=ls),
lambda x,y: x.cross_entropy(y, label_smoothing=ls))
@unittest.skipUnless(is_dtype_supported(dtypes.long), f"no long on {Device.DEFAULT}")
def test_nll_loss(self):
helper_test_op([(32,10), (32)],
lambda x,y: torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(x, dim=1), torch.clip(y,0).type(torch.long)),
lambda x,y: x.log_softmax(axis=1).nll_loss(y.clip(0).cast(dtypes.long)), forward_only=True)
@unittest.skipUnless(is_dtype_supported(dtypes.long), f"no long on {Device.DEFAULT}")
def test_nll_loss_3d(self):
helper_test_op([(32,10,3,3,3), (32,3,3,3)],
lambda x,y: torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(x, dim=1), torch.clip(y,0).type(torch.long)),
lambda x,y: x.log_softmax(axis=1).nll_loss(y.clip(0).cast(dtypes.long)), forward_only=True)
@unittest.skipUnless(is_dtype_supported(dtypes.long), f"no long on {Device.DEFAULT}")
def test_nll_loss_reductions(self):
for r in ("mean", "sum", "none"):
helper_test_op([(32,10), (32)],
@@ -2362,6 +2372,7 @@ class TestOps(unittest.TestCase):
lambda x,y: torch.nn.functional.nll_loss(x, torch.clip(y,0).type(torch.long), reduction="typo"),
lambda x,y: x.nll_loss(y.clip(0).cast(dtypes.long), reduction="typo"), expected=ValueError)
@unittest.skipUnless(is_dtype_supported(dtypes.long), f"no long on {Device.DEFAULT}")
def test_nll_loss_weight(self):
for r in ("mean", "sum", "none"):
helper_test_op([(32,10), (32), (10)],
@@ -2369,6 +2380,7 @@ class TestOps(unittest.TestCase):
weight=z, reduction=r),
lambda x,y,z: x.log_softmax(axis=1).nll_loss(y.clip(0).cast(dtypes.long), weight=z, reduction=r), forward_only=True)
@unittest.skipUnless(is_dtype_supported(dtypes.long), f"no long on {Device.DEFAULT}")
def test_nll_loss_3d_weight(self):
for r in ("mean", "sum", "none"):
helper_test_op([(32,10,3,3,3), (32,3,3,3), (10)],
@@ -2376,6 +2388,7 @@ class TestOps(unittest.TestCase):
weight=z, reduction=r),
lambda x,y,z: x.log_softmax(axis=1).nll_loss(y.clip(0).cast(dtypes.long), weight=z, reduction=r), forward_only=True)
@unittest.skipUnless(is_dtype_supported(dtypes.long), f"no long on {Device.DEFAULT}")
def test_nll_loss_ignore_index(self):
logits = [[2.0, 0.5, -1.0],
[1.5, 2.5, -0.5],
@@ -2405,7 +2418,8 @@ class TestOps(unittest.TestCase):
@unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
def test_cast(self):
helper_test_op([(3, 3)], lambda x: x.float())
helper_test_op(None, lambda x: x.float(), vals=[[0, 1, 2, 3]], forward_only=True)
if is_dtype_supported(dtypes.long):
helper_test_op(None, lambda x: x.float(), vals=[[0, 1, 2, 3]], forward_only=True)
helper_test_op(None, lambda x: x.float(), vals=[[True, False]], forward_only=True)
helper_test_op([(3, 3)], lambda x: x.int(), forward_only=True)
helper_test_op([(3, 3)], lambda x: x.bool(), forward_only=True)
@@ -2440,6 +2454,7 @@ class TestOpsUint8(unittest.TestCase):
lambda x: torch.nn.functional.interpolate((10*x).relu().type(torch.uint8), size=out_sz, mode="nearest-exact"),
lambda x: Tensor.interpolate((10*x).relu().cast('uint8'), size=out_sz, mode="nearest-exact"), forward_only=True)
@unittest.skipUnless(is_dtype_supported(dtypes.long), f"no long on {Device.DEFAULT}")
def test_min(self):
helper_test_op(None,
lambda x: x.type(torch.uint8).min(),

test/web/test_webgpu.js (new file, 65 lines)

@@ -0,0 +1,65 @@
const puppeteer = require("puppeteer");
const { spawn } = require("child_process");
const res = spawn("python", ["-m", "http.server", "8000"], { shell: true });
async function timeout(time) {
return new Promise((resolve) => setTimeout(resolve, time));
}
function cleanup(err) {
console.log("cleaning up");
res.kill();
if (err != null) {
console.error(err);
process.exit(1);
}
process.exit(0);
}
async function waitForText(selector, text) {
let n = 0;
let ready = false;
while (n < 30) {
const res = await (await selector.getProperty("textContent")).jsonValue();
console.log(`waiting for text ${text} got ${res}`);
if (res == text) {
ready = true;
break;
}
await timeout(1000);
n += 1;
}
return ready;
}
async function runTest() {
const browser = await puppeteer.launch({
headless: false,
args: ["--enable-unsafe-webgpu"],
});
const page = await browser.newPage();
page
.on("console", (message) =>
console.log(`message from console ${message.text()}`),
)
.on("pageerror", ({ message }) =>
console.log(`error from page ${message}`),
);
const res = await page.goto("http://localhost:8000/examples/index.html");
if (res.status() !== 200) throw new Error("Failed to load page");
const textSelector = await page.waitForSelector("#result");
const buttonSelector = await page.waitForSelector("input[type=button]");
const ready = await waitForText(textSelector, "ready");
if (!ready) throw new Error("Failed to load page");
await buttonSelector.evaluate((e) => e.click());
const done = await waitForText(textSelector, "hen");
if (!done) throw new Error("failed to get hen");
cleanup(null);
}
runTest().catch((err) => cleanup(err));


@@ -214,7 +214,8 @@ def is_dtype_supported(dtype:DType, device:Optional[str]=None) -> bool:
if dtype == dtypes.bfloat16:
# NOTE: this requires bf16 buffer support
return device in {"AMD"} or (device in {"CUDA", "NV"} and not CI and not getenv("PTX"))
if device in ["WEBGPU", "WEBGL"]: return dtype in [dtypes.float, dtypes.int32, dtypes.uint32]
if device == "WEBGPU": return dtype in [dtypes.bool, dtypes.char, dtypes.uchar, dtypes.short,
dtypes.ushort, dtypes.float, dtypes.int32, dtypes.uint32]
# for CI GPU and OSX, cl_khr_fp16 isn't supported
# for CI LLVM, it segfaults because it can't link to the casting function
# CI CUDA architecture is sm_35 but we need at least sm_70 to run fp16 ALUs

tinygrad/renderer/wgsl.py (new file, 99 lines)

@@ -0,0 +1,99 @@
from typing import List, Tuple, Optional
from tinygrad.dtype import DType, PtrDType, dtypes
from tinygrad.ops import UOp, Ops, PatternMatcher, UPat
from tinygrad.renderer.cstyle import CStyleLanguage, base_rewrite, extra_pm
from tinygrad.helpers import strip_parens
import math

# utility functions for handling packed load/store of < 4-byte data types: bool, char/uchar, short/ushort
packed_types = {dtypes.bool: dtypes.int, dtypes.char: dtypes.int, dtypes.uchar: dtypes.uint32, dtypes.short: dtypes.int, dtypes.ushort: dtypes.uint32}

def sign_extend(val:UOp, sext_am:int):
  # e.g. sign_extend(0xAB, 8): bit 7 is set, so 0xffffffff << 8 is ORed in and the result bitcasts to -85 as i32
  return (UOp.where((val >> (sext_am - 1)) > 0, UOp.const(dtypes.uint32, 0xffffffff) << sext_am, UOp.const(dtypes.uint32, 0)) \
    | val.bitcast(dtypes.uint32)).bitcast(dtypes.int)

# store for char: buf[idx/4] <- (var << (idx%4)*8)
def packed_store(bidx:UOp, var:UOp):
  shift_am = (bidx.src[1].cast(dtypes.uint32)%UOp.const(dtypes.uint32, 4//var.dtype.itemsize))*UOp.const(dtypes.uint32, 8*var.dtype.itemsize)
  new_v = (var & (0xFF if var.dtype.itemsize == 1 else 0xFFFF)).cast(dtypes.uint32) << shift_am
  return UOp.store(UOp(Ops.INDEX, bidx.dtype, (bidx.src[0], bidx.src[1]//(4//var.dtype.itemsize))), new_v.cast(packed_types[var.dtype]))

# load for char: sign_extend(buf[idx/4] >> ((idx%4)*8))
def packed_load(root:UOp, bidx:UOp, dtype:DType, var:Optional[UOp]=None):
  div_idx = bidx.src[1]//(4//dtype.itemsize)
  shift_am = (bidx.src[1].cast(dtypes.uint32)%UOp.const(dtypes.uint32, 4//dtype.itemsize))*UOp.const(dtypes.uint32, 8*dtype.itemsize)
  if var is not None: load = UOp.load(UOp(Ops.INDEX, bidx.dtype, (bidx.src[0], div_idx)), var, root.src[2], dtype=packed_types[dtype], arg=root.arg)
  else: load = UOp.load(UOp(Ops.INDEX, bidx.dtype, (bidx.src[0], div_idx)), *root.src[1:], dtype=packed_types[dtype], arg=root.arg)
  val = (load.cast(dtypes.uint32) >> shift_am) & (0xFF if dtype.itemsize == 1 else 0xFFFF)
  return sign_extend(val, 8*dtype.itemsize).cast(dtype) if dtype in [dtypes.char, dtypes.short] else val.cast(dtype)

wgsl_matcher = PatternMatcher([
  (UPat(Ops.CMPLT, src=(UPat(name="a", dtype=dtypes.bool), UPat(name="b")), name="c"),
   lambda a,b,c: UOp(c.op, c.dtype, (a.cast(dtypes.int), b.cast(dtypes.int)))),
  (UPat(Ops.XOR, dtype=dtypes.bool, src=(UPat(name="a"), UPat(name="b")), name="c"),
   lambda a,b,c: UOp(c.op, dtypes.int, (a.cast(dtypes.int), b.cast(dtypes.int))).cast(dtypes.bool)),
  *[(UPat(a, src=(UPat(name="b", dtype=(dtypes.uint, dtypes.int, dtypes.bool))), name="a"),
     lambda a,b: UOp(a, dtypes.float, (b.cast(dtypes.float),)).cast(b.dtype)) for a in (Ops.EXP2, Ops.SIN, Ops.LOG2, Ops.SQRT)],
  (UPat(Ops.LOAD, name="l", src=(UPat.var('b'),)), lambda l,b: packed_load(l,b,l.dtype) if l.dtype.itemsize < 4 else None),
  (UPat(Ops.LOAD, name="l", src=(UPat.var('b'), UPat.var('c'), UPat())),
   lambda l,b,c: packed_load(l,b,l.dtype,c.cast(packed_types[l.dtype])) if l.dtype.itemsize < 4 else None),
  (UPat.store(UPat.var("bidx"), UPat.var("var")), lambda bidx,var: packed_store(bidx,var) if var.dtype.itemsize < 4 else None),
  # rewrite a * where(g, nan, 1.0) -> where(g, nan, a), so the NaN gate survives without the multiply
  (UPat(Ops.MUL, name="m", src=(UPat(name="a"), UPat(Ops.WHERE, src=(UPat.var("g"),
    UPat(op=Ops.CONST, name="c1"), UPat(op=Ops.CONST, name="c2"))))),
   lambda m,a,g,c1,c2: UOp(Ops.WHERE, dtype=m.dtype, src=(g, UOp.const(dtype=dtypes.float, b=float('nan')), a))
   if math.isnan(c1.arg) and c2.arg == 1.0 else None),
]) + extra_pm

type_map = { dtypes.float: "f32", dtypes.uchar: "u32", dtypes.ushort: "u32", dtypes.short: "i32",
             dtypes.char: "i32", dtypes.int32: "i32", dtypes.uint32: "u32", dtypes.bool: "bool" }
buffer_map = { **type_map, dtypes.bool: "i32" }

class WGSLRenderer(CStyleLanguage):
  device = "WEBGPU"
  global_max = (65535, 65535, 65535)
  local_max = (256, 256, 64)
  code_for_workitem = {"g": lambda x: f"i32(gindex.{'xyz'[int(x)]})", "l": lambda x: f"i32(lindex.{'xyz'[int(x)]})"}
  extra_matcher = wgsl_matcher
  supports_float4 = False
  barrier = "workgroupBarrier();"
  code_for_op = {**CStyleLanguage.code_for_op, Ops.WHERE: lambda a,b,c,dtype: f"select({c},{b},{a})"}
  nan = "nan()"
  type_map = type_map

  string_rewrite = PatternMatcher([
    (UPat(Ops.CONST, dtype=dtypes.bool, name="x"), lambda ctx,x: "true" if x.arg else "false"),
    (UPat(Ops.CONST, dtype=(dtypes.uchar, dtypes.ushort, dtypes.uint32), name="x"), lambda ctx,x: f"bitcast<u32>({x.arg})" \
     if x.arg < 0 else f"{x.arg&0xFFFFFFFF}u"),
    (UPat(Ops.CONST, dtype=dtypes.int32, name="x"), lambda ctx,x: f"bitcast<i32>({x.arg}u)" if x.arg >= 0x80000000 else f"{x.arg}"),
    (UPat(Ops.DEFINE_LOCAL, name="x"), lambda ctx,x: f"var<workgroup> {ctx[x]}: array<{type_map[x.dtype.base]}, {x.arg[1]}>;"),
    (UPat(Ops.BITCAST, dtype=(dtypes.char, dtypes.uchar), name="x"), lambda ctx,x: f"bitcast<{type_map[x.dtype]}>({ctx[x.src[0]]}&0xFF)"),
    (UPat(Ops.BITCAST, dtype=(dtypes.short, dtypes.ushort), name="x"), lambda ctx,x: f"bitcast<{type_map[x.dtype]}>({ctx[x.src[0]]}&0xFFFF)"),
    (UPat(Ops.BITCAST, name="x"), lambda ctx,x: f"bitcast<{type_map[x.dtype]}>({ctx[x.src[0]]})"),
    (UPat(Ops.LOAD, src=(UPat.var("b"), UPat.var('v'), UPat.var("g"))), lambda ctx,b,v,g: f"select({ctx[v]}, {ctx[b]}, {ctx[g]})"),
    (UPat(Ops.LOAD, src=(UPat.var('b'),), allow_any_len=True), lambda ctx, b: ctx[b]),
    (UPat(Ops.INDEX, src=(UPat.var("buf"), UPat.var('idx'))),
     lambda ctx,buf,idx: f"{ctx[buf]}[{strip_parens(ctx[idx]) if idx.arg == Ops.ADD else ctx[idx]}]"),
    # sub-word stores go through atomicAdd so threads writing different lanes of one 32-bit word don't clobber each other
    (UPat(Ops.STORE, src=(UPat.var('b'), UPat.var("v")), allow_any_len=True), \
     lambda ctx,b,v: f"atomicAdd(&{ctx[b]}, {ctx[v]});" if b.src[0].dtype.itemsize < 4 else f"{ctx[b]} = {ctx[v]};"),
    # fix nan check: 'a != a' -> 'is_nan()'
    (UPat.var("a") != UPat.var("a"), lambda ctx,a: f"is_nan({ctx[a]})"),
  ]) + base_rewrite

  def render_cast(self, dt:DType, val: str) -> str: return f"{self.type_map[dt]}({val})"
  def render_dtype(self, dt:DType, mutable=True) -> str: return "var"
  def render_buf(self, dt:DType, rw:bool) -> str: return f"{f'atomic<{buffer_map[dt]}>' if rw and (dt.itemsize < 4) else buffer_map[dt.base]}"

  def render_kernel(self, function_name:str, kernel:List[str], bufs:List[Tuple[str,Tuple[DType,bool]]], uops:List[UOp], prefix=None) -> str:
    local_size = [num for _, num in sorted([u.arg for u in uops if u.op is Ops.SPECIAL and u.arg[0][0] == 'l'], key=lambda x: x[0])]
    if not local_size: local_size = [1]
    bind_it = iter(range(len(bufs)))
    external_local_bufs = [line.lstrip() for line in kernel if "var<workgroup>" in line]
    kernel[:] = [line for line in kernel if "var<workgroup>" not in line]
    prg = "fn nan() -> f32 { let bits = 0xffffffffu; return bitcast<f32>(bits); }\n"
    # trick to obfuscate compiler so that nan is detected properly: only a NaN can satisfy both
    # min(v, 1.0) == 1.0 and max(v, -1.0) == -1.0, since min/max pass through the non-NaN operand
    prg += "fn is_nan(v:f32) -> bool { return min(v, 1.0) == 1.0 && max(v, -1.0) == -1.0; }\n"
    prg += "@group(0) @binding(0)\nvar<uniform> INFINITY : f32;\n"
    prg += "\n".join((external_local_bufs or [])+[f"@group(0) @binding({next(bind_it)+1})" +
      f"{'var<storage,read_write>' if isinstance(dtype, PtrDType) else 'var<uniform>'}" +
      f"{name}:{f'array<{self.render_buf(dtype.base,rw)}>' if isinstance(dtype, PtrDType) else buffer_map[dtype]};" for name,(dtype,rw) in bufs])
    prg += f"\n@compute @workgroup_size({','.join([str(x) for x in local_size])}) fn {function_name}(@builtin(workgroup_id) gindex: vec3<u32>,"
    return prg + "@builtin(local_invocation_id) lindex: vec3<u32>) {\n" + "\n".join(kernel) + "\n}"

tinygrad/runtime/ops_webgpu.py Normal file

@@ -0,0 +1,65 @@
import functools
from tinygrad.device import Compiled, Allocator, Compiler
from tinygrad.renderer.wgsl import WGSLRenderer
from tinygrad.helpers import round_up
import wgpu
import struct

def create_uniform(wgpu_device, val) -> wgpu.GPUBuffer:
  buf = wgpu_device.create_buffer(size=4, usage=wgpu.BufferUsage.UNIFORM | wgpu.BufferUsage.COPY_DST)
  if isinstance(val, int): wgpu_device.queue.write_buffer(buf, 0, val.to_bytes(4, "little"))
  else: wgpu_device.queue.write_buffer(buf, 0, struct.pack('<f', val))
  return buf

class WebGPUProgram:
  def __init__(self, dev, name:str, lib:bytes):
    (self.dev, self.timestamp_supported) = dev
    self.name, self.lib, self.prg = name, lib, self.dev.create_shader_module(code=lib.decode())  # NOTE: this is the compiler
  def __call__(self, *bufs, global_size=(1,1,1), local_size=(1,1,1), vals=(), wait=False):
    wait = wait and self.timestamp_supported
    # binding 0 is the INFINITY uniform the renderer declares; data buffers follow, then int vals as uniforms
    binding_layouts = [{"binding": 0, "visibility": wgpu.ShaderStage.COMPUTE, "buffer": {"type": wgpu.BufferBindingType.uniform }}]
    binding_layouts += [{"binding": i+1, "visibility": wgpu.ShaderStage.COMPUTE,
      "buffer": {"type": wgpu.BufferBindingType.uniform if i >= len(bufs) else wgpu.BufferBindingType.storage }} for i in range(len(bufs)+len(vals))]  # noqa: E501
    bindings = [{"binding": 0, "resource": {"buffer": create_uniform(self.dev, float('inf')), "offset": 0, "size": 4}}]
    bindings += [{"binding": i+1, "resource": {"buffer": create_uniform(self.dev, x) if i >= len(bufs) else x, "offset": 0,
      "size": 4 if i >= len(bufs) else x.size}} for i,x in enumerate(bufs+vals)]  # noqa: E501
    bind_group_layout = self.dev.create_bind_group_layout(entries=binding_layouts)
    pipeline_layout = self.dev.create_pipeline_layout(bind_group_layouts=[bind_group_layout])
    bind_group = self.dev.create_bind_group(layout=bind_group_layout, entries=bindings)
    compute_pipeline = self.dev.create_compute_pipeline(layout=pipeline_layout, compute={"module": self.prg, "entry_point": self.name},)
    command_encoder = self.dev.create_command_encoder()
    if wait:
      query_set = self.dev.create_query_set(type=wgpu.QueryType.timestamp, count=2)
      query_buf = self.dev.create_buffer(size=16, usage=wgpu.BufferUsage.QUERY_RESOLVE | wgpu.BufferUsage.COPY_SRC)
      timestamp_writes = {"query_set": query_set, "beginning_of_pass_write_index": 0, "end_of_pass_write_index": 1}
    compute_pass = command_encoder.begin_compute_pass(timestamp_writes=timestamp_writes if wait else None)  # pylint: disable=E0606
    compute_pass.set_pipeline(compute_pipeline)
    compute_pass.set_bind_group(0, bind_group, [], 0, 999999)  # last 2 not used
    compute_pass.dispatch_workgroups(*global_size)  # x y z
    compute_pass.end()
    if wait:
      command_encoder.resolve_query_set(query_set=query_set, first_query=0, query_count=2, destination=query_buf, destination_offset=0)
    self.dev.queue.submit([command_encoder.finish()])
    # timestamps are in nanoseconds; return elapsed seconds when profiling was requested
    return ((timestamps:=self.dev.queue.read_buffer(query_buf).cast("Q").tolist())[1] - timestamps[0]) / 1e9 if wait else None

# WebGPU buffers have to be 4-byte aligned
class WebGpuAllocator(Allocator):
  def __init__(self, dev): self.dev = dev
  def _alloc(self, size: int, options):
    return self.dev.create_buffer(size=round_up(size, 4), usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_DST | wgpu.BufferUsage.COPY_SRC)
  def _copyin(self, dest, src: memoryview):
    if src.nbytes % 4:
      padded_src = bytearray(round_up(src.nbytes, 4))
      padded_src[:src.nbytes] = src
    self.dev.queue.write_buffer(dest, 0, padded_src if src.nbytes % 4 else src)
  def _copyout(self, dest: memoryview, src):
    buffer_data = self.dev.queue.read_buffer(src, 0)
    # the GPU buffer may be padded to 4 bytes at _alloc, so trim readback to the destination size
    dest[:] = buffer_data[:dest.nbytes] if src._nbytes > dest.nbytes else buffer_data

class WebGpuDevice(Compiled):
  def __init__(self, device:str):
    adapter = wgpu.gpu.request_adapter_sync(power_preference="high-performance")
    timestamp_supported = wgpu.FeatureName.timestamp_query in adapter.features
    wgpu_device = adapter.request_device_sync(required_features=[wgpu.FeatureName.timestamp_query] if timestamp_supported else [])
    super().__init__(device, WebGpuAllocator(wgpu_device), WGSLRenderer(), Compiler(),
                     functools.partial(WebGPUProgram, (wgpu_device, timestamp_supported)))
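
Finally, a minimal end-to-end sketch of exercising the backend, not part of this commit; it assumes the wgpu Python package is installed and a WebGPU adapter is available:

from tinygrad import Tensor

# buffers come from WebGpuAllocator (4-byte aligned); the generated WGSL kernel runs via WebGPUProgram.__call__
t = Tensor([1, 2, 3], device="WEBGPU") + 1
print(t.tolist())  # [2, 3, 4]; readback goes through _copyout / queue.read_buffer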