diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 6c2d54be9e..d1aadc9498 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -64,7 +64,7 @@ jobs: run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded - name: Test AMX tensor cores run: | - DEBUG=2 CLANG=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded + DEBUG=2 CPU=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded - name: Run Tensor Core GEMM (float) run: DEBUG=2 python3.11 extra/gemm/simple_matmul.py | tee matmul.txt diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4df40f654f..44a1de2d3b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -81,7 +81,7 @@ jobs: run: DEBUG=100 python3 -c "from tinygrad import Tensor; N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N); c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2); print((c.numpy() - (a.numpy() @ b.numpy())).mean())" - name: Compile EfficientNet to C and test it run: | - CLANG=1 PYTHONPATH="." python examples/compile_efficientnet.py > recognize.c + CPU=1 PYTHONPATH="." python examples/compile_efficientnet.py > recognize.c clang -O2 recognize.c -lm -o recognize cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock @@ -355,13 +355,13 @@ jobs: llvm: 'true' - name: Test ONNX (GPU) run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - - name: Test ONNX (CLANG) - run: CLANG=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 + - name: Test ONNX (CPU) + run: CPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - name: Test ONNX (LLVM) run: LLVM=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - name: Run CLOUD=1 Test run: | - CLOUDDEV=CLANG CLOUD=1 python3 test/test_tiny.py + CLOUDDEV=CPU CLOUD=1 python3 test/test_tiny.py CLOUDDEV=GPU CLOUD=1 python3 test/test_tiny.py CLOUDDEV=GPU IMAGE=2 CLOUD=1 python3 test/test_tiny.py - name: Test Optimization Helpers @@ -378,7 +378,7 @@ jobs: uses: ./.github/actions/process-replay testmodels: - name: Models (llvm+clang+gpu) + name: Models (llvm+cpu+gpu) runs-on: ubuntu-22.04 timeout-minutes: 10 steps: @@ -395,8 +395,8 @@ jobs: run: LLVM=1 python -m pytest -n=auto test/models --durations=20 - name: Test models (gpu) run: GPU=1 python -m pytest -n=auto test/models --durations=20 - - name: Test models (clang) - run: CLANG=1 python -m pytest -n=auto test/models --durations=20 + - name: Test models (cpu) + run: CPU=1 python -m pytest -n=auto test/models --durations=20 - name: Run process replay tests uses: ./.github/actions/process-replay @@ -431,8 +431,8 @@ jobs: run: PYTHONPATH="." 
DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py - name: Test LLVM=1 DEVECTORIZE=0 run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" - - name: Test CLANG=1 DEVECTORIZE=0 - run: CLANG=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" + - name: Test CPU=1 DEVECTORIZE=0 + run: CPU=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" testwebgpu: name: Linux (WebGPU) @@ -464,7 +464,7 @@ jobs: strategy: fail-fast: false matrix: - backend: [llvm, clang, gpu, ptx, amd, nv] #, triton] + backend: [llvm, cpu, gpu, ptx, amd, nv] #, triton] name: Linux (${{ matrix.backend }}) runs-on: ubuntu-22.04 @@ -482,10 +482,10 @@ jobs: amd: ${{ matrix.backend == 'amd' && 'true' }} cuda: ${{ (matrix.backend == 'ptx' || matrix.backend == 'triton' || matrix.backend == 'nv') && 'true' }} - name: Set env - run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nPTX=1\nMOCKGPU=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nNV=1\nMOCKGPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' || matrix.backend == 'nv' && 'NV=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV + run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nPTX=1\nMOCKGPU=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nNV=1\nMOCKGPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' || matrix.backend == 'nv' && 'NV=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV - name: Check Device.DEFAULT and print some source run: | - PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU','AMD','NV'], Device.DEFAULT" + PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CPU','CUDA','GPU','AMD','NV'], Device.DEFAULT" DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add - name: Run pytest (not cuda or amd) if: matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd' && matrix.backend != 'nv' @@ -582,7 +582,7 @@ jobs: strategy: fail-fast: false matrix: - backend: [metal, llvm, clang] + backend: [metal, llvm, cpu] name: MacOS (${{ matrix.backend }}) runs-on: macos-15 timeout-minutes: 10 @@ -596,7 +596,7 @@ jobs: deps: testing_minimal llvm: ${{ matrix.backend == 'llvm' && 'true' }} - name: Set env - run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'metal' && 'METAL=1\nJIT=2'}}" >> $GITHUB_ENV + run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'metal' && 'METAL=1\nJIT=2'}}" >> $GITHUB_ENV - name: Check Device.DEFAULT and print some source run: | python -c "from tinygrad import Device; assert Device.DEFAULT == '${{ matrix.backend }}'.upper(), Device.DEFAULT" @@ -612,7 +612,7 @@ jobs: strategy: fail-fast: false matrix: - backend: [llvm, clang] + backend: [llvm, cpu] name: Windows (${{ matrix.backend }}) runs-on: 
windows-latest @@ -627,7 +627,7 @@ jobs: deps: testing_unit - name: Set env shell: bash - run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1'}}" >> $GITHUB_ENV + run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1'}}" >> $GITHUB_ENV - name: Run unit tests if: matrix.backend=='llvm' run: python -m pytest -n=auto test/unit/ --ignore=test/unit/test_disk_tensor.py --ignore=test/unit/test_elf.py --ignore=test/unit/test_tar.py diff --git a/README.md b/README.md index 226c3ddfab..7b0dc08564 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers tinygrad already supports numerous accelerators, including: - [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py) -- [x] [CLANG (C Code)](tinygrad/runtime/ops_clang.py) +- [x] [CPU (C Code)](tinygrad/runtime/ops_cpu.py) - [x] [LLVM](tinygrad/runtime/ops_llvm.py) - [x] [METAL](tinygrad/runtime/ops_metal.py) - [x] [CUDA](tinygrad/runtime/ops_cuda.py) diff --git a/docs/abstractions2.py b/docs/abstractions2.py index 4d0ceacb7a..1998cb3383 100644 --- a/docs/abstractions2.py +++ b/docs/abstractions2.py @@ -7,7 +7,7 @@ print("******** first, the runtime ***********") -from tinygrad.runtime.ops_clang import ClangJITCompiler, MallocAllocator, CPUProgram +from tinygrad.runtime.ops_cpu import ClangJITCompiler, MallocAllocator, CPUProgram # allocate some buffers out = MallocAllocator.alloc(4) @@ -34,7 +34,7 @@ assert val == 5 print("******** second, the Device ***********") -DEVICE = "CLANG" # NOTE: you can change this! +DEVICE = "CPU" # NOTE: you can change this! import struct from tinygrad.dtype import dtypes @@ -90,7 +90,7 @@ out = a.alu(Ops.ADD, b) # schedule the computation as a list of kernels sched, _, becomes_map = create_schedule_with_vars(out.sink()) -for si in sched: print(si.ast.op) # NOTE: the first two convert it to CLANG +for si in sched: print(si.ast.op) # NOTE: the first two convert it to CPU # NOTE: UOps are no longer mutable, the scheduler gives you a map to lookup which BUFFER the result was written to out = becomes_map[out] diff --git a/docs/developer/runtime.md b/docs/developer/runtime.md index 0c8a9d5ed9..b246dfeb0e 100644 --- a/docs/developer/runtime.md +++ b/docs/developer/runtime.md @@ -38,7 +38,7 @@ The `Allocator` class is responsible for managing memory on the device. There is The `Program` class is created for each loaded program. It is responsible for executing the program on the device. As an example, here is a `CPUProgram` implementation which loads program and runs it. -::: tinygrad.runtime.ops_clang.CPUProgram +::: tinygrad.runtime.ops_cpu.CPUProgram options: members: true diff --git a/docs/env_vars.md b/docs/env_vars.md index 4e4d3696de..2ac4066141 100644 --- a/docs/env_vars.md +++ b/docs/env_vars.md @@ -31,13 +31,13 @@ These control the behavior of core tinygrad even when used as a library. 
Variable | Possible Value(s) | Description ---|---|--- DEBUG | [1-6] | enable debugging output, with 4 you get operations, timings, speed, generated code and more -GPU | [1] | enable the GPU backend +GPU | [1] | enable the GPU (OpenCL) backend CUDA | [1] | enable CUDA backend AMD | [1] | enable AMD backend NV | [1] | enable NV backend METAL | [1] | enable Metal backend (for Mac M1 and after) METAL_XCODE | [1] | enable Metal using macOS Xcode SDK -CLANG | [1] | enable Clang backend +CPU | [1] | enable CPU (Clang) backend LLVM | [1] | enable LLVM backend BEAM | [#] | number of beams in kernel beam search DEFAULT_FLOAT | [HALF, ...]| specify the default float dtype (FLOAT32, HALF, BFLOAT16, FLOAT64, ...), default to FLOAT32 diff --git a/docs/mnist.md b/docs/mnist.md index 8aae08f241..ce55890eb9 100644 --- a/docs/mnist.md +++ b/docs/mnist.md @@ -17,7 +17,7 @@ from tinygrad import Device print(Device.DEFAULT) ``` -You will see `CUDA` here on a GPU instance, or `CLANG` here on a CPU instance. +You will see `CUDA` here on a GPU instance, or `CPU` here on a CPU instance. ## A simple model diff --git a/docs/runtime.md b/docs/runtime.md index 93ee07c46f..af78854b03 100644 --- a/docs/runtime.md +++ b/docs/runtime.md @@ -1,6 +1,6 @@ # Runtimes -tinygrad supports various runtimes, enabling your code to scale across a wide range of devices. The default runtime can be automatically selected based on the available hardware, or you can force a specific runtime to be default using environment variables (e.g., `CLANG=1`). +tinygrad supports various runtimes, enabling your code to scale across a wide range of devices. The default runtime can be automatically selected based on the available hardware, or you can force a specific runtime to be default using environment variables (e.g., `CPU=1`). | Runtime | Description | Requirements | |---------|-------------|--------------| @@ -10,6 +10,6 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra | [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | M1+ Macs; Metal 3.0+ for `bfloat` support | | [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | NVIDIA GPU with CUDA support | | [GPU (OpenCL)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_gpu.py) | Accelerates computations using OpenCL on GPUs | OpenCL 2.0 compatible device | -| [CLANG (C Code)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_clang.py) | Runs on CPU using the clang compiler | `clang` compiler in system `PATH` | +| [CPU (C Code)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang compiler | `clang` compiler in system `PATH` | | [LLVM (LLVM IR)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_llvm.py) | Runs on CPU using the LLVM compiler infrastructure | llvm libraries installed and findable | | [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | Dawn library installed and findable. Download binaries [here](https://github.com/wpmed92/pydawn/releases/tag/v0.1.6). 
| diff --git a/examples/compile_efficientnet.py b/examples/compile_efficientnet.py index 1a27c159c4..3690e7ed33 100644 --- a/examples/compile_efficientnet.py +++ b/examples/compile_efficientnet.py @@ -15,9 +15,9 @@ if __name__ == "__main__": if getenv("WEBGPU"): safe_save(get_state_dict(model), (dirname / "net.safetensors").as_posix()) load_state_dict(model, safe_load(str(dirname / "net.safetensors"))) - mode = "clang" if getenv("CLANG", "") != "" else "webgpu" if getenv("WEBGPU", "") != "" else "" + mode = "clang" if getenv("CPU", "") != "" else "webgpu" if getenv("WEBGPU", "") != "" else "" prg, inp_sizes, out_sizes, state = export_model(model, mode, Tensor.randn(1,3,224,224)) - if getenv("CLANG", "") == "": + if getenv("CPU", "") == "": ext = "js" if getenv("WEBGPU", "") != "" else "json" with open(dirname / f"net.{ext}", "w") as text_file: text_file.write(prg) @@ -68,6 +68,6 @@ if __name__ == "__main__": else printf("%s\\n", lbls[best_idx]); }""") - # CLANG=1 python3 examples/compile_efficientnet.py | clang -O2 -lm -x c - -o recognize && DEBUG=1 time ./recognize docs/showcase/stable_diffusion_by_tinygrad.jpg + # CPU=1 python3 examples/compile_efficientnet.py | clang -O2 -lm -x c - -o recognize && DEBUG=1 time ./recognize docs/showcase/stable_diffusion_by_tinygrad.jpg # category : 281 (tabby, tabby cat) with 9.452788 print('\n'.join(cprog)) diff --git a/examples/compile_tensorflow.py b/examples/compile_tensorflow.py index 7733934880..c906592282 100644 --- a/examples/compile_tensorflow.py +++ b/examples/compile_tensorflow.py @@ -1,7 +1,7 @@ # An example to compile a small Tensorflow model to extremely portable C code import os, sys -os.environ["CLANG"] = '1' +os.environ["CPU"] = '1' os.environ["JIT"] = '2' import numpy as np diff --git a/examples/llm.c/export.py b/examples/llm.c/export.py index d257b81778..b041aba10e 100755 --- a/examples/llm.c/export.py +++ b/examples/llm.c/export.py @@ -2,7 +2,7 @@ import os if "NOOPT" not in os.environ: os.environ["NOOPT"] = "1" from tinygrad import Device, nn, Tensor, dtypes, Variable -Device.DEFAULT = "CLANG" +Device.DEFAULT = "CPU" from train_gpt2 import GPT, GPTConfig from tinygrad.helpers import dedup, to_function_name, flatten, getenv, GlobalCounters, ansilen, to_function_name from tinygrad.engine.realize import get_kernel, run_schedule @@ -43,9 +43,9 @@ if __name__ == "__main__": ast_dedup = dedup([si.ast for si in sched if si.ast.op is Ops.SINK]) srcs = {} for ast in ast_dedup: - k = get_kernel(Device["CLANG"].renderer, ast) + k = get_kernel(Device["CPU"].renderer, ast) k.linearize() - src = Device["CLANG"].renderer.render(to_function_name(k.name), k.uops) + src = Device["CPU"].renderer.render(to_function_name(k.name), k.uops) srcs[ast] = (k.name, src) print("functions:", len(srcs)) used_buffers = dedup(flatten([si.bufs for si in sched])) diff --git a/examples/mlperf/dataloader.py b/examples/mlperf/dataloader.py index 2bde3fe5f1..833915f190 100644 --- a/examples/mlperf/dataloader.py +++ b/examples/mlperf/dataloader.py @@ -170,13 +170,13 @@ def batch_load_resnet(batch_size=64, val=False, shuffle=True, seed=None, pad_fir def process_batch_bert(data: List[dict]) -> dict[str, Tensor]: return { - "input_ids": Tensor(np.concatenate([s["input_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - "input_mask": Tensor(np.concatenate([s["input_mask"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - "segment_ids": Tensor(np.concatenate([s["segment_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - 
"masked_lm_positions": Tensor(np.concatenate([s["masked_lm_positions"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - "masked_lm_ids": Tensor(np.concatenate([s["masked_lm_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - "masked_lm_weights": Tensor(np.concatenate([s["masked_lm_weights"] for s in data], axis=0), dtype=dtypes.float32, device="CLANG"), - "next_sentence_labels": Tensor(np.concatenate([s["next_sentence_labels"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), + "input_ids": Tensor(np.concatenate([s["input_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "input_mask": Tensor(np.concatenate([s["input_mask"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "segment_ids": Tensor(np.concatenate([s["segment_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "masked_lm_positions": Tensor(np.concatenate([s["masked_lm_positions"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "masked_lm_ids": Tensor(np.concatenate([s["masked_lm_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "masked_lm_weights": Tensor(np.concatenate([s["masked_lm_weights"] for s in data], axis=0), dtype=dtypes.float32, device="CPU"), + "next_sentence_labels": Tensor(np.concatenate([s["next_sentence_labels"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), } def load_file(file: str): diff --git a/examples/mlperf/helpers.py b/examples/mlperf/helpers.py index 0c01db2a9a..07469f2757 100644 --- a/examples/mlperf/helpers.py +++ b/examples/mlperf/helpers.py @@ -222,11 +222,11 @@ def get_mlperf_bert_model(): def get_fake_data_bert(BS:int): return { - "input_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CLANG"), - "input_mask": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CLANG"), - "segment_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CLANG"), - "masked_lm_positions": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CLANG"), - "masked_lm_ids": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CLANG"), - "masked_lm_weights": Tensor.empty((BS, 76), dtype=dtypes.float32, device="CLANG"), - "next_sentence_labels": Tensor.empty((BS, 1), dtype=dtypes.int32, device="CLANG"), + "input_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"), + "input_mask": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"), + "segment_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"), + "masked_lm_positions": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CPU"), + "masked_lm_ids": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CPU"), + "masked_lm_weights": Tensor.empty((BS, 76), dtype=dtypes.float32, device="CPU"), + "next_sentence_labels": Tensor.empty((BS, 1), dtype=dtypes.int32, device="CPU"), } diff --git a/extra/backends/clang_graph.py b/extra/backends/clang_graph.py index 48fdc439d2..5e558a3ef7 100644 --- a/extra/backends/clang_graph.py +++ b/extra/backends/clang_graph.py @@ -5,7 +5,7 @@ from tinygrad.engine.jit import GraphRunner, GraphException from tinygrad.device import Buffer, Device from tinygrad.engine.realize import ExecItem, CompiledRunner from tinygrad.ops import Variable -from tinygrad.runtime.ops_clang import ClangProgram +from tinygrad.runtime.ops_cpu import ClangProgram from tinygrad.renderer.cstyle import ClangRenderer render_dtype = ClangRenderer().render_dtype @@ -30,7 +30,7 @@ class ClangGraph(GraphRunner): code.append(f" {cast(CompiledRunner, ji.prg).p.function_name}({','.join(args)});") code.append("}") if DEBUG 
>= 4: print("\n".join(code)) - compiler = Device["CLANG"].compiler + compiler = Device["CPU"].compiler assert compiler is not None self._prg = ClangProgram("batched", compiler.compile(prgs+"\n"+"\n".join(code))) # no point in caching the pointers diff --git a/extra/export_model.py b/extra/export_model.py index 51b48c6f28..edd5b01f49 100644 --- a/extra/export_model.py +++ b/extra/export_model.py @@ -8,7 +8,7 @@ from tinygrad.helpers import Context from tinygrad.dtype import dtypes import json -EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CLANG", "CUDA", "GPU"] +EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "GPU"] def compile_net(run:TinyJit, special_names:Dict[int,str]) -> Tuple[Dict[str,str],List[Tuple[str,List[str],List[int]]],Dict[str,Tuple[int,DType,int]],Dict[str,Tensor]]: functions, bufs, bufs_to_save, statements, bufnum = {}, {}, {}, [], 0 @@ -191,7 +191,7 @@ export default {exported_name}; """ def export_model(model, target:str, *inputs, model_name: Optional[str] = None): - assert Device.DEFAULT in EXPORT_SUPPORTED_DEVICE, "only WEBGPU, CLANG, CUDA, GPU, METAL are supported" + assert Device.DEFAULT in EXPORT_SUPPORTED_DEVICE, "only WEBGPU, CPU, CUDA, GPU, METAL are supported" with Context(JIT=2): run,special_names = jit_model(model, *inputs) functions, statements, bufs, bufs_to_save = compile_net(run, special_names) state = get_state_dict(model) diff --git a/extra/gemm/tvm_gemm.py b/extra/gemm/tvm_gemm.py index d09dd36c35..03fe0f7894 100644 --- a/extra/gemm/tvm_gemm.py +++ b/extra/gemm/tvm_gemm.py @@ -32,8 +32,8 @@ import os from tinygrad.tensor import Tensor # define the compute -A = Tensor.rand(M, K, device="clang") -B = Tensor.rand(K, N, device="clang") +A = Tensor.rand(M, K, device="CPU") +B = Tensor.rand(K, N, device="CPU") C = (A.reshape(M, 1, K) * B.permute(1,0).reshape(1, N, K)).sum(axis=2) sched = C.schedule() @@ -42,6 +42,6 @@ from tinygrad.device import CompilerOptions lin = Kernel(sched[-1].ast, CompilerOptions(has_local=False, supports_float4=False)) #lin.hand_coded_optimizations() lin.linearize() -from tinygrad.runtime.ops_clang import renderer +from tinygrad.runtime.ops_cpu import renderer src = renderer("mmult", lin.uops) print(src) diff --git a/test/external/external_model_benchmark.py b/test/external/external_model_benchmark.py index 4c0b720df2..6653839ce0 100644 --- a/test/external/external_model_benchmark.py +++ b/test/external/external_model_benchmark.py @@ -138,7 +138,7 @@ def assert_allclose(tiny_out:dict, onnx_out:dict, rtol=1e-5, atol=1e-5): else: np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}") if __name__ == "__main__": - devices = [Device.DEFAULT] if getenv("NOCLANG") else [Device.DEFAULT, "CLANG"] + devices = [Device.DEFAULT] if getenv("NOCLANG") else [Device.DEFAULT, "CPU"] if getenv("MODEL", "") != "": benchmark_model(getenv("MODEL", ""), devices, True) else: for m in MODELS: benchmark_model(m, devices, True) diff --git a/test/external/external_multi_gpu.py b/test/external/external_multi_gpu.py index 00c02b41cb..e5dd836c88 100644 --- a/test/external/external_multi_gpu.py +++ b/test/external/external_multi_gpu.py @@ -19,8 +19,8 @@ if __name__ == "__main__": with Timing("GPU initial sync: "): sync() with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"): - c0 = (Tensor.ones(sz, device="clang")/2).realize() - c1 = (Tensor.ones(sz, device="clang")/4).realize() + c0 = (Tensor.ones(sz, device="CPU")/2).realize() + c1 = (Tensor.ones(sz, device="CPU")/4).realize() 
print(c0.lazydata.base.realized) print(c1.lazydata.base.realized) diff --git a/test/external/external_test_example.py b/test/external/external_test_example.py index d6149d1467..1dc0b7b547 100644 --- a/test/external/external_test_example.py +++ b/test/external/external_test_example.py @@ -23,10 +23,10 @@ def multidevice_test(fxn): class TestExample(unittest.TestCase): @multidevice_test - def test_convert_to_clang(self, device): + def test_convert_to_cpu(self, device): a = Tensor([[1,2],[3,4]], device=device) assert a.numpy().shape == (2,2) - b = a.to("CLANG") + b = a.to("CPU") assert b.numpy().shape == (2,2) @multidevice_test diff --git a/test/imported/test_indexing.py b/test/imported/test_indexing.py index 21c5b251cb..271aeb586b 100644 --- a/test/imported/test_indexing.py +++ b/test/imported/test_indexing.py @@ -181,7 +181,7 @@ class TestIndexing(unittest.TestCase): # self.assertRaises(TypeError, delitem) # TODO: LLVM is quite fast, why are other compiled backends slow? - @unittest.skipIf(CI and Device.DEFAULT in ["CLANG", "GPU", "METAL", "NV", "AMD"], "slow") + @unittest.skipIf(CI and Device.DEFAULT in ["CPU", "GPU", "METAL", "NV", "AMD"], "slow") def test_advancedindex(self): # integer array indexing diff --git a/test/models/test_mnist.py b/test/models/test_mnist.py index e51555f507..2f5e939862 100644 --- a/test/models/test_mnist.py +++ b/test/models/test_mnist.py @@ -49,7 +49,7 @@ class TinyConvNet: x = x.reshape(shape=[x.shape[0], -1]) return x.dot(self.l1) -@unittest.skipIf(CI and Device.DEFAULT == "CLANG", "slow") +@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow") class TestMNIST(unittest.TestCase): def test_sgd_onestep(self): np.random.seed(1337) diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py index 5fa29defc9..17a725b3bf 100644 --- a/test/models/test_real_world.py +++ b/test/models/test_real_world.py @@ -48,7 +48,7 @@ class TestRealWorld(unittest.TestCase): def tearDown(self): dtypes.default_float = self.old_float - @unittest.skipIf(CI and Device.DEFAULT == "CLANG", "slow, covered by METAL") + @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow, covered by METAL") @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need dtypes.float16") def test_stable_diffusion(self): params = unet_params @@ -95,7 +95,7 @@ class TestRealWorld(unittest.TestCase): with Context(JIT=0): return model(t, v).realize() helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23 if CI else 0.9, 137 if CI else 396, all_jitted=True) - @unittest.skipIf(CI and Device.DEFAULT == "CLANG", "slow") + @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow") def test_train_mnist(self): from examples.beautiful_mnist import Model with Tensor.train(): @@ -113,7 +113,7 @@ class TestRealWorld(unittest.TestCase): helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.07, 92) - @unittest.skipIf(CI and Device.DEFAULT in {"CLANG", "GPU", "LLVM"}, "slow") + @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "GPU", "LLVM"}, "slow") def test_train_cifar(self): with Tensor.train(): model = SpeedyResNet(Tensor.ones((12,3,2,2))) diff --git a/test/models/test_whisper.py b/test/models/test_whisper.py index 458e8b69c5..f1696fd490 100644 --- a/test/models/test_whisper.py +++ b/test/models/test_whisper.py @@ -16,7 +16,7 @@ TRANSCRIPTION_2 = "a slightly longer audio file so that we can test batch transc TEST_FILE_3_URL = 'https://homepage.ntu.edu.tw/~karchung/miniconversations/mc45.mp3' TRANSCRIPTION_3 = "Just lie back and relax. 
Is the level of pressure about right? Yes, it's fine, and I'd like conditioner please. Sure. I'm going to start the second lathering now. Would you like some Q-tips? How'd you like it cut? I'd like my bangs and the back trimmed, and I'd like the rest thinned out a bit and layered. Where would you like the part? On the left, right about here. Here, have a look. What do you think? It's fine. Here's a thousand anti-dollars. It's 30-ant extra for the rants. Here's your change and receipt. Thank you, and please come again. So how do you like it? It could have been worse, but you'll notice that I didn't ask her for her card. Hmm, yeah. Maybe you can try that place over there next time." # noqa: E501 -@unittest.skipIf(CI and Device.DEFAULT in ["CLANG"], "slow") +@unittest.skipIf(CI and Device.DEFAULT in ["CPU"], "slow") @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support") class TestWhisper(unittest.TestCase): @classmethod diff --git a/test/test_copy_speed.py b/test/test_copy_speed.py index 0c3a85fc98..5d0cc69c26 100644 --- a/test/test_copy_speed.py +++ b/test/test_copy_speed.py @@ -24,7 +24,7 @@ class TestCopySpeed(unittest.TestCase): s.unlink() def testCopyCPUtoDefault(self): - t = Tensor.ones(N, N, device="clang").contiguous().realize() + t = Tensor.ones(N, N, device="CPU").contiguous().realize() print(f"buffer: {t.nbytes()*1e-9:.2f} GB") for _ in range(3): with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): @@ -35,7 +35,7 @@ class TestCopySpeed(unittest.TestCase): def testCopyCPUtoDefaultFresh(self): print("fresh copy") for _ in range(3): - t = Tensor.ones(N, N, device="clang").contiguous().realize() + t = Tensor.ones(N, N, device="CPU").contiguous().realize() with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): # noqa: F821 with Timing("queue: "): t.to(Device.DEFAULT).realize() @@ -47,14 +47,14 @@ class TestCopySpeed(unittest.TestCase): print(f"buffer: {t.nbytes()*1e-9:.2f} GB") for _ in range(3): with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): - t.to('clang').realize() + t.to('CPU').realize() @unittest.skipIf(CI, "CI doesn't have 6 GPUs") @unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU") def testCopyCPUto6GPUs(self): from tinygrad.runtime.ops_gpu import CLDevice if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs") - t = Tensor.ones(N, N, device="clang").contiguous().realize() + t = Tensor.ones(N, N, device="CPU").contiguous().realize() print(f"buffer: {t.nbytes()*1e-9:.2f} GB") for _ in range(3): with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"): diff --git a/test/test_fuzz_shape_ops.py b/test/test_fuzz_shape_ops.py index 2bf7889a40..b90d98cb03 100644 --- a/test/test_fuzz_shape_ops.py +++ b/test/test_fuzz_shape_ops.py @@ -38,7 +38,7 @@ def apply(tor, ten, tor_fn, ten_fn=None): except: ten, ok = None, not ok # noqa: E722 return tor, ten, ok -@unittest.skipIf(CI and Device.DEFAULT in ("CLANG", "NV"), "slow") +@unittest.skipIf(CI and Device.DEFAULT in ("CPU", "NV"), "slow") class TestShapeOps(unittest.TestCase): @settings.get_profile(__file__) @given(st_shape(), st_int32, st.one_of(st_int32, st.lists(st_int32))) diff --git a/test/test_jit.py b/test/test_jit.py index 7abb13100f..7c6bf77206 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -22,7 +22,7 @@ def _simple_test(add, extract=lambda x: x, N=10): class TestJit(unittest.TestCase): @settings(deadline=2e4) - 
@unittest.skipUnless(Device.DEFAULT in ["LLVM", "CLANG"], f"no support on {Device.DEFAULT}") + @unittest.skipUnless(Device.DEFAULT in ["LLVM", "CPU"], f"no support on {Device.DEFAULT}") @given(strat.sampled_from([Tensor.exp2, Tensor.log2, Tensor.sin])) def test_approx_jit_timeout(self, op): with Context(TRANSCENDENTAL=2): @@ -497,8 +497,8 @@ class TestCopyInsideJit(unittest.TestCase): @TinyJit def add(x,y) -> Tensor: return x.to(Device.DEFAULT)+y for _ in range(5): - # create a Tensor in CLANG - a = Tensor.rand(16,16,device="CLANG").realize() + # create a Tensor on CPU + a = Tensor.rand(16,16,device="CPU").realize() b = Tensor.rand(16,16).realize() out = add(a,b) np.testing.assert_allclose(out.flatten().tolist(), [x+y for x,y in zip(a.flatten().tolist(), b.flatten().tolist())]) @@ -529,12 +529,12 @@ class TestJitPrune(unittest.TestCase): w2_prune = TinyJit(w2, prune=True) for _ in range(3): - a = Tensor.rand(16, device="CLANG").realize() + a = Tensor.rand(16, device="CPU").realize() out = w2_noprune(a) np.testing.assert_allclose(out.tolist(), [x*2+y for x,y in zip(weights.tolist(), a.tolist())]) for _ in range(3): - a = Tensor.rand(16, device="CLANG").realize() + a = Tensor.rand(16, device="CPU").realize() out = w2_prune(a) np.testing.assert_allclose(out.tolist(), [x*2+y for x,y in zip(weights.tolist(), a.tolist())]) diff --git a/test/test_kernel_cache.py b/test/test_kernel_cache.py index 851f4d83fd..164b501a41 100644 --- a/test/test_kernel_cache.py +++ b/test/test_kernel_cache.py @@ -5,7 +5,7 @@ from tinygrad import Device class TestKernelCache(unittest.TestCase): def test_kernel_cache_in_action(self): - if Device.DEFAULT not in ["CLANG"]: + if Device.DEFAULT not in ["CPU"]: self.skipTest("No custom kernel cache is implemented") unique_const = 0.6765677269 @@ -16,14 +16,14 @@ class TestKernelCache(unittest.TestCase): a1 = Tensor.rand(4,4).realize() b1 = Tensor.rand(4,4).realize() - orig_compile_func = Device['CLANG'].compiler - Device['CLANG'].compiler = None # making it not callable + orig_compile_func = Device['CPU'].compiler + Device['CPU'].compiler = None # making it not callable try: x1 = a1 + b1 + unique_const x1.realize() # Same kernel should be from cache. finally: - Device['CLANG'].compiler = orig_compile_func + Device['CPU'].compiler = orig_compile_func if __name__ == "__main__": unittest.main() diff --git a/test/test_linearizer.py b/test/test_linearizer.py index 9c718b279e..9efa36b038 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -64,7 +64,7 @@ def helper_tc_ensure_uops_and_opts_count(n: int, m:int, k:int, dtype_in:DType, d class TestLinearizer(unittest.TestCase): def test_arg_dedup(self): # NOTE: this realize exists because Tensor.numpy calls .contiguous() internally - # without contiguous folding, rand.to("CLANG") and rand.contiguous().to("CLANG") are different UOps. + # without contiguous folding, rand.to("CPU") and rand.contiguous().to("CPU") are different UOps. # this test asserts they are the identical Buffer # having different buffers is fine for correctness, because the outputs match. a, b = Tensor.randn(4).realize(), Tensor.randn(4).realize() @@ -983,8 +983,8 @@ class TestLinearizer(unittest.TestCase): # NOTE: can reenable, it does work. 
it just makes BEAM slow @unittest.expectedFailure - @unittest.skipUnless(Device.DEFAULT == "CLANG", "test only for CLANG") - def test_upcast_with_locals_clang(self): + @unittest.skipUnless(Device.DEFAULT == "CPU", "test only for CPU") + def test_upcast_with_locals_cpu(self): out = Tensor.ones(64,64).contiguous() @ Tensor.ones(64,64).contiguous() k = Kernel(out.schedule()[-1].ast) k.apply_opt(Opt(OptOps.LOCAL, axis=0, arg=4)) @@ -1136,7 +1136,7 @@ class TestLinearizer(unittest.TestCase): assert u.src[-1].src[0].op != Ops.ASSIGN @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores") - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"}, "CLANG does not support using a different type for accumulation") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"}, "CPU does not support using a different type for accumulation") def test_tensor_cores_unroll_casted_phi(self): tc = [tc for tc in Device[Device.DEFAULT].renderer.tensor_cores if tc.dtype_in != tc.dtype_out][0] x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in) @@ -1148,7 +1148,7 @@ class TestLinearizer(unittest.TestCase): assert u.src[-1].src[0].op != Ops.ASSIGN @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores") - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"}, "CLANG does not support using a different type for accumulation") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"}, "CPU does not support using a different type for accumulation") def test_tensor_cores_unroll_casted_phi_with_children(self): # all ASSIGN children are outside the loop tc = [tc for tc in Device[Device.DEFAULT].renderer.tensor_cores if tc.dtype_in != tc.dtype_out][0] @@ -1445,7 +1445,7 @@ class TestFloat4(unittest.TestCase): assert TestFloat4.count_float4(k) == (2, 1) - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "CLANG with AMX upcasts float up to size 16") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "CPU with AMX upcasts float up to size 16") def test_float4_multidim(self): a = Tensor.rand(2, 8).realize() b = Tensor.rand(2, 8).realize() @@ -1462,7 +1462,7 @@ class TestFloat4(unittest.TestCase): assert TestFloat4.count_float4(k) == (4, 2) - @unittest.skipUnless(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "Only CLANG with AMX upcasts float up to size 16") + @unittest.skipUnless(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "Only CPU with AMX upcasts float up to size 16") def test_float4_multidim_amx(self): def kernel_for_shape(size, shift): a = Tensor.rand(2, size).realize() @@ -1487,7 +1487,7 @@ class TestFloat4(unittest.TestCase): for i in range(len(sizes)): assert TestFloat4.count_float4(kernel_for_shape(sizes[i], shifts[i]), excepted_upcast_size[i]) == expected_output[i] - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "CLANG with AMX upcasts float up to size 16") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "CPU with AMX upcasts float up to size 16") def test_float4_unaligned_load(self): a = Tensor.rand(9).realize().shrink(((1, 9),)) b = Tensor.rand(9).realize().shrink(((1, 9),)) @@ -1500,7 +1500,7 @@ class TestFloat4(unittest.TestCase): assert TestFloat4.count_float4(k) == (0, 1) - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "CLANG with AMX upcasts float up to size 16") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "CPU with AMX upcasts float up to size 16") def test_float4_multidim_unaligned_load(self): a = Tensor.rand(2, 
9).realize().shrink(((0, 2), (1, 9),)) b = Tensor.rand(2, 9).realize().shrink(((0, 2), (1, 9),)) @@ -1517,7 +1517,7 @@ class TestFloat4(unittest.TestCase): assert TestFloat4.count_float4(k) == (0, 2) - @unittest.skipUnless(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "Only CLANG with AMX upcasts float up to size 16") + @unittest.skipUnless(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "Only CPU with AMX upcasts float up to size 16") def test_float4_multidim_unaligned_load_amx(self): def kernel_for_shape(size, shift): a = Tensor.rand(2, size).realize().shrink(((0, 2), (1, size),)) diff --git a/test/test_linearizer_failures.py b/test/test_linearizer_failures.py index b37bb524b3..6e70530b29 100644 --- a/test/test_linearizer_failures.py +++ b/test/test_linearizer_failures.py @@ -498,7 +498,7 @@ class TestLinearizerFailures(unittest.TestCase): opts = [Opt(op=OptOps.PADTO, axis=0, arg=32)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - #@unittest.skipIf(Device.DEFAULT in ("LLVM", "METAL", "CLANG"), "flaky") + #@unittest.skipIf(Device.DEFAULT in ("LLVM", "METAL", "CPU"), "flaky") @unittest.skip("flaky everywhere") def test_failure_22(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( diff --git a/test/test_pickle.py b/test/test_pickle.py index 3bfae36c89..be09ed7d57 100644 --- a/test/test_pickle.py +++ b/test/test_pickle.py @@ -38,7 +38,7 @@ class TestPickle(unittest.TestCase): def test_pickle_realized_tensor_alt(self): print("** init") - t = Tensor.rand(10, 10).to("CLANG").realize() + t = Tensor.rand(10, 10).to("CPU").realize() st = pickle.dumps(t) t_values = t.numpy() del t # free buffers @@ -50,7 +50,7 @@ class TestPickle(unittest.TestCase): def test_pickle_realized_tensor_alt2(self): print("** init") - t = Tensor.rand(10, 10).to("CLANG").realize() + t = Tensor.rand(10, 10).to("CPU").realize() tensor_uop = t.lazydata assert tensor_uop.is_realized, f"expected {tensor_uop} to be realized" t_values = t.numpy() @@ -93,7 +93,7 @@ class TestPickle(unittest.TestCase): np.testing.assert_equal(vt2.numpy(), 20) def test_pickle_buffer_view(self): - t = Tensor.arange(10, device="CLANG").contiguous().realize() + t = Tensor.arange(10, device="CPU").contiguous().realize() vt = t[3:5].contiguous().realize() assert hasattr(vt.lazydata.buffer, 'base') ref_value = vt.tolist() diff --git a/test/test_randomness.py b/test/test_randomness.py index fa351082a9..0581ee65ff 100644 --- a/test/test_randomness.py +++ b/test/test_randomness.py @@ -239,7 +239,7 @@ class TestRandomness(unittest.TestCase): numpy_func=lambda x: np.random.randint(low=-2, high=5, size=x))) self.assertTrue(equal_distribution(partial(Tensor.randint, low=-2, high=5, dtype="int32"), numpy_func=lambda x: np.random.randint(low=-2, high=5, size=x))) - self.assertTrue(Tensor.randint(1, device="CLANG").device=="CLANG") + self.assertTrue(Tensor.randint(1, device="CPU").device=="CPU") # check types of args with self.assertRaises(TypeError): Tensor.randint((3, 4), low=0.1, high=3) with self.assertRaises(TypeError): Tensor.randint((3, 4), low=0, high=3.5) diff --git a/test/test_renderer_failures.py b/test/test_renderer_failures.py index 0868062e03..ebe8a19329 100644 --- a/test/test_renderer_failures.py +++ b/test/test_renderer_failures.py @@ -38,7 +38,7 @@ class TestCStyleFailures(unittest.TestCase): store = UOp.store(a.index(idx), alu) sink = UOp(Ops.SINK, dtypes.void, (store,)) uops = linearize_uop(full_graph_rewrite(sink, Device[Device.DEFAULT].renderer)) - # CLANG doesn't use the max function + # CPU doesn't use the max function ret = 
_test_uop_result([Tensor([1])], uops)[0] self.assertEqual(ret[0], 1) diff --git a/test/test_schedule.py b/test/test_schedule.py index f877ef076d..f4ae67bf3f 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -1732,16 +1732,16 @@ class TestIndexing(unittest.TestCase): self.assertIs(sched[1].ast.op, Ops.BUFFER_VIEW) np.testing.assert_equal(a.numpy(), [[4, 5]]) - @unittest.skipIf(Device.DEFAULT == "CLANG", "tests copy from ext device") + @unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from ext device") def test_arange_shrink_copy(self): - a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).to("CLANG") + a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).to("CPU") sched = self.check_schedule(a, 1) self.assertIs(sched[-1].ast.op, Ops.COPY) np.testing.assert_equal(a.numpy(), [[4, 5]]) - @unittest.skipIf(Device.DEFAULT == "CLANG", "tests copy from ext device") + @unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from ext device") def test_arange_expand_copy(self): - a = Tensor.arange(4).reshape(2, 2, 1).expand(2, 2, 2).contiguous().to("CLANG") + a = Tensor.arange(4).reshape(2, 2, 1).expand(2, 2, 2).contiguous().to("CPU") sched = self.check_schedule(a, 1) self.assertIs(sched[1].ast.op, Ops.COPY) self.assertIs(sched[0].ast.src[0].src[2].op, Ops.ADD) @@ -2279,23 +2279,23 @@ class TestConst(unittest.TestCase): run_schedule(sched, var_vals) self.assertEqual(a.tolist(), 3) -@unittest.skipIf(Device.DEFAULT == "CLANG", "tests copy from another device to clang") +@unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from another device to cpu") class TestCopyFolding(unittest.TestCase): def test_const_copy_is_free(self): - b = Tensor(1).to("CLANG") + b = Tensor(1).to("CPU") check_schedule(b, 0, filter_sink=False) assert b.item() == 1 def test_late_const_copy_folding(self): a = Tensor.arange(3).realize() zeros = Tensor.zeros(3).realize() - b = (a*zeros).to("CLANG") + b = (a*zeros).to("CPU") run_schedule(check_schedule(b, 0, filter_sink=False)) self.assertListEqual(b.tolist(), [0, 0, 0]) def test_alu_after_copy(self): - a = Tensor.ones((4,)).to("CLANG").lazydata - b = Tensor.empty(4, device="CLANG").lazydata + a = Tensor.ones((4,)).to("CPU").lazydata + b = Tensor.empty(4, device="CPU").lazydata add = a+b add = schedule_graph_rewrite(add) assert all_same([x.device for x in add.src]), f"ALU has different devices! 
{[x.device for x in add.src]}" @@ -2348,13 +2348,13 @@ class TestCopyFolding(unittest.TestCase): def test_permute_on_disk(self): with open(temp('dt_arange_4_permute'), "wb") as f: f.write(Tensor.arange(4).realize().lazydata.base.buffer.as_buffer()) a = Tensor.empty(4, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_4_permute')}") - b = a.reshape(2, 2).permute(1, 0).to("CLANG") + b = a.reshape(2, 2).permute(1, 0).to("CPU") b.realize() self.assertListEqual(b.tolist(), [[0, 2], [1, 3]]) def test_permute_after_shrink(self): a = Tensor.arange(5) - b = a.shrink(((0, 4),)).reshape(2, 2).permute(1, 0).to("CLANG") + b = a.shrink(((0, 4),)).reshape(2, 2).permute(1, 0).to("CPU") b.realize() self.assertListEqual(b.tolist(), [[0, 2], [1, 3]]) @@ -2364,7 +2364,7 @@ class TestCopyFolding(unittest.TestCase): def test_permute_after_shrink_on_disk(self): with open(temp('dt_arange_5_permute'), "wb") as f: f.write(Tensor.arange(5).realize().lazydata.base.buffer.as_buffer()) a = Tensor.empty(5, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_5_permute')}") - b = a.shrink(((0, 4),)).reshape(2, 2).permute(1, 0).to("CLANG") + b = a.shrink(((0, 4),)).reshape(2, 2).permute(1, 0).to("CPU") b.realize() self.assertListEqual(b.tolist(), [[0, 2], [1, 3]]) diff --git a/test/test_tensor.py b/test/test_tensor.py index 0728a3f41b..7a7d21dabf 100644 --- a/test/test_tensor.py +++ b/test/test_tensor.py @@ -247,7 +247,7 @@ class TestTinygrad(unittest.TestCase): assert a.shape == b.shape, f"shape mismatch {a.shape} != {b.shape}" def test_rand_like_device(self): - a = Tensor.ones(3, 3, device="CLANG") + a = Tensor.ones(3, 3, device="CPU") b = Tensor.rand_like(a) self.assertEqual(b.device, a.device) @@ -326,7 +326,7 @@ class TestTinygrad(unittest.TestCase): def test_tensor_from_blob(self): x = memoryview(bytearray(16)).cast('I') - t = Tensor.from_blob(mv_address(x), (4,), dtype=dtypes.int, device="CLANG") + t = Tensor.from_blob(mv_address(x), (4,), dtype=dtypes.int, device="CPU") z = (t+1) np.testing.assert_equal(z.numpy(), [1, 1, 1, 1]) @@ -695,7 +695,7 @@ class TestZeroShapeTensor(unittest.TestCase): class TestTensorCreationDevice(unittest.TestCase): # test auxiliary tensors are created on the same device def test_one_hot(self): - y = Tensor([1, 2, 3]).to("CLANG") + y = Tensor([1, 2, 3]).to("CPU") x = y.one_hot(10) x.realize() diff --git a/test/test_uop_graph.py b/test/test_uop_graph.py index ee06194af2..7513bd70bd 100644 --- a/test/test_uop_graph.py +++ b/test/test_uop_graph.py @@ -661,7 +661,7 @@ class TestLoadStoreFolder(unittest.TestCase): sink = float4_rewrite(sink.sink()) assert len([x for x in sink.toposort if x.op is Ops.LOAD]) == 1 - @unittest.skipIf(Device.DEFAULT in {"CLANG"} and AMX, "CLANG with AMX upcasts float up to size 16") + @unittest.skipIf(Device.DEFAULT in {"CPU"} and AMX, "CPU with AMX upcasts float up to size 16") def test_two_load_fold(self): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr()) load = [UOp(Ops.LOAD, dtypes.float, (buf.index(UOp.const(dtypes.int, i)),)) for i in range(8)] diff --git a/test/test_uops.py b/test/test_uops.py index ef9952c5ba..5a2c476822 100644 --- a/test/test_uops.py +++ b/test/test_uops.py @@ -106,11 +106,11 @@ class TestUOps(unittest.TestCase): self._equal(f([a,b,c], op, dts), fxn(a,b,c)) class TestFloatUOps(TestUOps): - @unittest.skipIf(Device.DEFAULT == "CLANG", 'not supported as uop') + @unittest.skipIf(Device.DEFAULT == "CPU", 'not supported as uop') def test_exp2(self): self._test_uop_fxn(Ops.EXP2, lambda a: np.exp2(a)) - @unittest.skipIf(Device.DEFAULT == "CLANG", 
'not supported as uop') + @unittest.skipIf(Device.DEFAULT == "CPU", 'not supported as uop') def test_log2(self): self._test_uop_fxn(Ops.LOG2, lambda a: math.log2(a) if a > 0 else float('-inf' if a==0 else 'nan')) - @unittest.skipIf(Device.DEFAULT == "CLANG", 'not supported as uop') + @unittest.skipIf(Device.DEFAULT == "CPU", 'not supported as uop') def test_sin(self): self._test_uop_fxn(Ops.SIN, lambda a: math.sin(a)) def test_recip(self): self._test_uop_fxn(Ops.RECIP, lambda a: 1/a if a != 0 else float('inf')) def test_sqrt(self): self._test_uop_fxn(Ops.SQRT, lambda a: math.sqrt(a) if a >= 0 else float('nan')) diff --git a/test/test_uops_stats.py b/test/test_uops_stats.py index e224933ce3..3845629d45 100644 --- a/test/test_uops_stats.py +++ b/test/test_uops_stats.py @@ -65,12 +65,12 @@ class TestMemoryCount(unittest.TestCase): _, mem = get_stats(a.assign(a+a)) self.assertEqual(mem, 1024*1024*2) # 1 read + 1 write - @unittest.skipIf(Device.DEFAULT == "CLANG", "test copy to CLANG from other device") + @unittest.skipIf(Device.DEFAULT == "CPU", "test copy to CPU from other device") def test_copyout(self): - a = Tensor.empty(32, dtype=dtypes.uint8).to("CLANG") + a = Tensor.empty(32, dtype=dtypes.uint8).to("CPU") _, mem = get_stats(a) self.assertEqual(mem, 32*1) - a = Tensor.empty(32, dtype=dtypes.uint32).to("CLANG") + a = Tensor.empty(32, dtype=dtypes.uint32).to("CPU") _, mem = get_stats(a) self.assertEqual(mem, 32*4) diff --git a/test/test_zero_copy.py b/test/test_zero_copy.py index 1b999314c3..6f2b2cda0b 100644 --- a/test/test_zero_copy.py +++ b/test/test_zero_copy.py @@ -13,7 +13,7 @@ def time_tensor_numpy(out:Tensor): N = 4096 class TestZeroCopy(unittest.TestCase): - @unittest.skipIf(Device.DEFAULT not in {"CLANG", "LLVM", "METAL"}, "device isn't zero copy") + @unittest.skipIf(Device.DEFAULT not in {"CPU", "LLVM", "METAL"}, "device isn't zero copy") def test_zero_copy_from_default_to_cpu(self): demo = Tensor.rand(1).realize() t1 = time_tensor_numpy(demo) diff --git a/test/testextra/test_f16_decompress.py b/test/testextra/test_f16_decompress.py index 9cfa936a49..6077f67a90 100644 --- a/test/testextra/test_f16_decompress.py +++ b/test/testextra/test_f16_decompress.py @@ -7,7 +7,7 @@ import numpy as np class TestF16Decompression(unittest.TestCase): def test_u32_to_f16(self): - a = Tensor.randn(50, dtype=dtypes.float16, device=None if is_dtype_supported(dtypes.float16) else "CLANG:0") + a = Tensor.randn(50, dtype=dtypes.float16, device=None if is_dtype_supported(dtypes.float16) else "CPU") f16_as_u32 = a.bitcast(dtypes.uint32) if is_dtype_supported(dtypes.float16) else a.bitcast(dtypes.uint32).to(Device.DEFAULT) f16 = u32_to_f16(f16_as_u32) ref = a.numpy() diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py index 6291216d1d..8f37ff7eb0 100644 --- a/test/unit/test_disk_tensor.py +++ b/test/unit/test_disk_tensor.py @@ -211,7 +211,7 @@ class TestSafetensors(unittest.TestCase): def helper_test_disk_tensor(fn, data, np_fxn, tinygrad_fxn=None): if tinygrad_fxn is None: tinygrad_fxn = np_fxn pathlib.Path(temp(fn)).unlink(missing_ok=True) - tinygrad_tensor = Tensor(data, device="CLANG").to(f"disk:{temp(fn)}") + tinygrad_tensor = Tensor(data, device="CPU").to(f"disk:{temp(fn)}") numpy_arr = np.array(data) tinygrad_fxn(tinygrad_tensor) np_fxn(numpy_arr) @@ -251,7 +251,7 @@ class TestDiskTensor(unittest.TestCase): def test_write_ones(self): pathlib.Path(temp("dt_write_ones")).unlink(missing_ok=True) - out = Tensor.ones(10, 10, device="CLANG").contiguous() + out = Tensor.ones(10, 
10, device="CPU").contiguous() outdisk = out.to(f"disk:{temp('dt_write_ones')}") print(outdisk) outdisk.realize() @@ -289,13 +289,13 @@ class TestDiskTensor(unittest.TestCase): def test_bitcast(self): with open(temp('dt_bitcast'), "wb") as f: f.write(bytes(range(10,20))) t = Tensor.empty(5, dtype=dtypes.int16, device=f"disk:{temp('dt_bitcast')}") - ret = t.to("CLANG").bitcast(dtypes.uint16) + 1 + ret = t.to("CPU").bitcast(dtypes.uint16) + 1 assert ret.tolist() == [2827, 3341, 3855, 4369, 4883] def test_bitcast_view(self): with open(temp('dt_bitcast_view'), "wb") as f: f.write(bytes(range(10, 24))) t = Tensor.empty(3, dtype=dtypes.uint, device=f"disk:{temp('dt_bitcast_view')}").shrink([(0, 2)]) - ret = t.bitcast(dtypes.uint16).to("CLANG") + 1 + ret = t.bitcast(dtypes.uint16).to("CPU") + 1 assert ret.tolist() == [2827, 3341, 3855, 4369] @unittest.skipIf(OSX, "new LLVM has an issue on OSX") @@ -363,10 +363,10 @@ class TestPathTensor(unittest.TestCase): np.testing.assert_array_equal(t.numpy(), np.frombuffer(self.test_data, dtype=np.uint8)) def test_path_tensor_with_device(self): - t = Tensor(self.test_file, device="CLANG") + t = Tensor(self.test_file, device="CPU") self.assertEqual(t.shape, (100,)) self.assertEqual(t.dtype, dtypes.uint8) - self.assertEqual(t.device, "CLANG") + self.assertEqual(t.device, "CPU") np.testing.assert_array_equal(t.numpy(), np.frombuffer(self.test_data, dtype=np.uint8)) def test_path_tensor_empty_file(self): @@ -391,8 +391,8 @@ class TestPathTensor(unittest.TestCase): def test_path_tensor_copy_to_device(self): t = Tensor(self.test_file) - t_cpu = t.to("CLANG") - self.assertEqual(t_cpu.device, "CLANG") + t_cpu = t.to("CPU") + self.assertEqual(t_cpu.device, "CPU") np.testing.assert_array_equal(t_cpu.numpy(), np.frombuffer(self.test_data, dtype=np.uint8)) if __name__ == "__main__": diff --git a/test/unit/test_elf.py b/test/unit/test_elf.py index f55b11e17e..cdf02b7728 100644 --- a/test/unit/test_elf.py +++ b/test/unit/test_elf.py @@ -1,5 +1,5 @@ import unittest, subprocess, platform -from tinygrad.runtime.ops_clang import ClangJITCompiler +from tinygrad.runtime.ops_cpu import ClangJITCompiler from tinygrad.runtime.support.elf import elf_loader class TestElfLoader(unittest.TestCase): diff --git a/test/unit/test_verify_ast.py b/test/unit/test_verify_ast.py index ccae3810b3..55adb79326 100644 --- a/test/unit/test_verify_ast.py +++ b/test/unit/test_verify_ast.py @@ -91,7 +91,7 @@ class TestVerifyAST(unittest.TestCase): def test_const_view_always_valid(self): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), 0) - a = UOp.const(dtypes.int, 0).replace(src=(UOp(Ops.VIEW, dtypes.void, (UOp(Ops.DEVICE, arg="CLANG"),), ShapeTracker.from_shape(())),)) + a = UOp.const(dtypes.int, 0).replace(src=(UOp(Ops.VIEW, dtypes.void, (UOp(Ops.DEVICE, arg="CPU"),), ShapeTracker.from_shape(())),)) st = UOp.store(buf, ShapeTracker.from_shape(()).to_uop(), a.cast(dtypes.float)) helper_test_verify_ast(st) diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 027ac28791..23fcdff637 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -374,7 +374,7 @@ class Kernel: check(smem_sz <= self.opts.shared_max, f"exceeds maximum shared memory size: needs {smem_sz}, max {self.opts.shared_max}") if opt.op is OptOps.LOCAL: # cyan - # NOTE: LLVM/CLANG can use locals too, but they are treated the same as globals (still helpful for L1 cache) + # NOTE: LLVM/CPU can use locals too, but they are treated the same as globals (still helpful for L1 cache) # it's disabled for 
now since it makes BEAM slow for little gain check(self.opts.has_local, "target does not support local") check(axis < self.global_dims, "local is for globals") diff --git a/tinygrad/device.py b/tinygrad/device.py index eb60565fdc..ce08cbc081 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -10,7 +10,7 @@ from tinygrad.renderer import Renderer # **************** Device **************** -ALL_DEVICES = ["METAL", "AMD", "NV", "CUDA", "QCOM", "GPU", "CLANG", "LLVM", "DSP", "WEBGPU"] +ALL_DEVICES = ["METAL", "AMD", "NV", "CUDA", "QCOM", "GPU", "CPU", "LLVM", "DSP", "WEBGPU"] class _Device: def __init__(self) -> None: self._devices = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")] diff --git a/tinygrad/nn/state.py b/tinygrad/nn/state.py index 99032cfe63..024ae6aba4 100644 --- a/tinygrad/nn/state.py +++ b/tinygrad/nn/state.py @@ -195,8 +195,8 @@ def torch_load(t:Tensor) -> dict[str, Tensor]: if tuple(permute_indexes) != tuple(range(len(permute_indexes))): intermediate_shape = tuple([shape_strides[x][0] for x in argsort(permute_indexes)]) assert tuple([shape_strides[i][1] for i in argsort(permute_indexes)]) == strides_for_shape(intermediate_shape), "nonpermutable strides" - if DEBUG >= 3: print(f"WARNING: this torch load is slow. CLANG to permute {intermediate_shape} with {permute_indexes}") - assert storage[1] != dtypes.bfloat16, "can't CLANG permute BF16" + if DEBUG >= 3: print(f"WARNING: this torch load is slow. to permute {intermediate_shape} with {permute_indexes}") + assert storage[1] != dtypes.bfloat16, "can't permute BF16" # TODO: find a nice way to support all shapetracker on disktensors ret = ret.to(None).reshape(intermediate_shape).permute(permute_indexes) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 4b7fde290a..7b16ffb616 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -181,7 +181,7 @@ class GroupOp: All = set(Ops) # some BUFFER ops can be processed with only a view -view_supported_devices = {"LLVM", "CLANG", "CUDA", "NV", "AMD", "METAL", "QCOM", "DSP", "DISK"} +view_supported_devices = {"LLVM", "CPU", "CUDA", "NV", "AMD", "METAL", "QCOM", "DSP", "DISK"} # https://en.wikipedia.org/wiki/Identity_element def identity_element(op:Ops, dt:DType) -> ConstType: return dtypes.as_const({Ops.ADD:0, Ops.MUL:1, Ops.MAX:dtypes.min(dt)}[op], dt) diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 9f0cfd2ca8..bcda50770f 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -18,7 +18,7 @@ base_rewrite = PatternMatcher([ lambda ctx,x: f"for ({ctx.render_dtype(x.dtype)} {ctx[x]} = {ctx[x.src[0]]}; {ctx[x]} < {ctx[x.src[1]]}; {ctx[x]}++) {{"), (UPat(Ops.VECTORIZE, name="x"), lambda ctx,x: f"{ctx.float4.replace('float4', ctx.render_dtype(x.dtype))}" + \ - (f"{{{','.join([ctx[y] for y in x.src])}}}" if ctx.device in {'CLANG', 'DSP'} else f"({','.join([ctx[y] for y in x.src])})")), + (f"{{{','.join([ctx[y] for y in x.src])}}}" if ctx.device in {'CPU', 'DSP'} else f"({','.join([ctx[y] for y in x.src])})")), (UPat(Ops.CAST, name="x"), lambda ctx,x: f"__builtin_convertvector({ctx[x.src[0]]}, {ctx.render_dtype(x.dtype)})" if x.dtype.count > 1 and not isinstance(x.dtype, PtrDType) else None), (UPat(Ops.CAST, name="x"), lambda ctx,x: f"({ctx.render_cast(x.dtype, ctx[x.src[0]])})"), @@ -52,7 +52,7 @@ base_rewrite = PatternMatcher([ (UPat(GroupOp.ALU, name="x"), lambda ctx,x: ctx.code_for_op[x.op]( *([strip_parens(ctx[v]) if v.op == x.op and x.op in {Ops.ADD, Ops.MUL, 
Ops.XOR} else ctx[v] for v in x.src]), x.dtype)), (UPat(Ops.GEP, name="x"), lambda ctx,x: ctx[x.src[0]] + \ - (f"[{x.arg[0]}]" if x.src[0].dtype.count > (8 if ctx.device in {"CUDA", "NV"} else 4) or ctx.device in {'CLANG', 'DSP'} else \ + (f"[{x.arg[0]}]" if x.src[0].dtype.count > (8 if ctx.device in {"CUDA", "NV"} else 4) or ctx.device in {'CPU', 'DSP'} else \ f".{'xyzwabcd'[x.arg[0]]}")), # custom passes through with format (UPat(Ops.CUSTOM, name="x"), lambda ctx,x: x.arg.format(*[ctx[y] for y in x.src])), @@ -175,7 +175,7 @@ class CStyleLanguage(Renderer): return self.render_kernel(name, kernel, list(bufs.values()), uops) class ClangRenderer(CStyleLanguage): - device = "CLANG" + device = "CPU" float4 = "(float4)" has_local = False global_max = None diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_cpu.py similarity index 98% rename from tinygrad/runtime/ops_clang.py rename to tinygrad/runtime/ops_cpu.py index 58eb84b591..b8b34f62c8 100644 --- a/tinygrad/runtime/ops_clang.py +++ b/tinygrad/runtime/ops_cpu.py @@ -20,3 +20,5 @@ class ClangJITCompiler(Compiler): class ClangDevice(Compiled): def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram) + +CPUDevice = ClangDevice \ No newline at end of file diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py index 7917928652..3256d88c33 100644 --- a/tinygrad/runtime/ops_llvm.py +++ b/tinygrad/runtime/ops_llvm.py @@ -17,7 +17,7 @@ class LLVMCompiler(Compiler): triple = {'AArch64': b'aarch64', 'X86': b'x86_64'}[host_arch] + b'-none-unknown-elf' target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt) - # +reserve-x18 here does the same thing as -ffixed-x18 in ops_clang.py, see comments there for why it's needed on arm osx + # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures()) if DEBUG >= 2: print(f"LLVM init for {cpu!r} with {feats!r}") self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, cpu, feats, diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py index 039ed786c2..427340cb61 100644 --- a/tinygrad/runtime/ops_python.py +++ b/tinygrad/runtime/ops_python.py @@ -173,7 +173,7 @@ class PythonProgram: # C, D (8 elements on 8 threads) def c_map(lane, elem): return (lane, elem) ul[i] = wmma_helper(8, 16, 16, 16, 8, a_elem, b_elem, c_map) - elif arg[4] == "CLANG": + elif arg[4] == "CPU": def elem(x, col, row, _): return x[col+row][0] # k is always 0 def c_map(_, elem): return (elem%16, elem//16) ul[i] = wmma_helper(1, 1, 16, 16, 256, elem, elem, c_map) @@ -194,7 +194,7 @@ class PythonRenderer(Renderer): if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm80 if getenv("EMULATE_CUDA_SM75"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm75 if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores - if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CLANG", ClangRenderer.tensor_cores + if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CPU", ClangRenderer.tensor_cores def render(self, uops:list[UOp]) -> str: lops = [(u.op, u.dtype, [uops.index(v) for v in u.src], u.arg) for u in uops] diff --git a/tinygrad/tensor.py 
b/tinygrad/tensor.py index 303a747703..a703587078 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -274,7 +274,7 @@ class Tensor(SimpleMathTrait): def assign(self, x) -> Tensor: # TODO: this is a hack for writing to DISK. remove with working assign if isinstance(self.device, str) and self.device.startswith("DISK"): - if x.__class__ is not Tensor: x = Tensor(x, device="CLANG", dtype=self.dtype) + if x.__class__ is not Tensor: x = Tensor(x, device="CPU", dtype=self.dtype) self.contiguous().realize().lazydata.base.realized.ensure_allocated().copyin(x._data()) return self if x.__class__ is not Tensor: x = Tensor(x, device=self.device, dtype=self.dtype) @@ -297,11 +297,11 @@ class Tensor(SimpleMathTrait): def _data(self) -> memoryview: if 0 in self.shape: return memoryview(bytearray(0)) # NOTE: this realizes on the object from as_buffer being a Python object - cpu = self.cast(self.dtype.base).contiguous().to("CLANG").realize() + cpu = self.cast(self.dtype.base).contiguous().to("CPU").realize() buf = cast(UOp, cpu.lazydata).base.realized assert buf is not None, f"{cast(UOp, cpu.lazydata).base} was not realized" - if self.device != "CLANG": buf.options = BufferSpec(nolru=True) - return buf.as_buffer(allow_zero_copy=True if self.device != "CLANG" else False) + if self.device != "CPU": buf.options = BufferSpec(nolru=True) + return buf.as_buffer(allow_zero_copy=True if self.device != "CPU" else False) def data(self) -> memoryview: """ @@ -520,8 +520,8 @@ class Tensor(SimpleMathTrait): if (numel := prod(shape)) == 0: return Tensor.zeros(shape, device=_device, dtype=dtype, **kwargs) num = ceildiv(numel * dtype.itemsize, 4) - # when using MOCKGPU and NV generate rand on CLANG - if getenv("MOCKGPU") and device.startswith("NV"): device = "CLANG" + # when using MOCKGPU and NV generate rand on CPU + if getenv("MOCKGPU") and device.startswith("NV"): device = "CPU" # generate per device seeds and rng counter if we haven't seen this device yet if device not in Tensor._device_seeds:
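
The change above is a mechanical rename (`CLANG` → `CPU`, `ops_clang.py` → `ops_cpu.py`); the user-visible effect is only the device name and the environment variable. Below is a minimal sketch of exercising the renamed backend once this diff is applied, using only pieces shown elsewhere in the patch (`CPU=1`, `Device.DEFAULT`, explicit `device="CPU"` strings); it is an illustration, not part of the patch itself.

```python
# minimal sketch, assuming this diff is applied
# run as: CPU=1 python3 check_cpu_backend.py   (CPU=1 replaces the old CLANG=1)
import os
os.environ.setdefault("CPU", "1")   # force the CPU (clang) backend, as compile_tensorflow.py now does

from tinygrad import Tensor, Device

print(Device.DEFAULT)               # expected to print "CPU" instead of the old "CLANG"

# tensors can still be placed on the backend explicitly by name
a = Tensor([[1, 2], [3, 4]], device="CPU")
b = a.to(Device.DEFAULT) + 1
print(b.tolist())                   # [[2, 3], [4, 5]]
```

This mirrors what the CI matrix in test.yml now does: the `cpu` backend entry exports `CPU=1` and then asserts `Device.DEFAULT` resolves to `CPU`.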