diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a4c707c2db..6d3f329f91 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -330,8 +330,8 @@ jobs:
     - name: Fuzz Test shape ops
       run: python test/external/fuzz_shape_ops.py

-  testgpuimage:
-    name: 'GPU IMAGE Tests'
+  testopenclimage:
+    name: 'CL IMAGE Tests'
     runs-on: ubuntu-22.04
     timeout-minutes: 10
     env:
@@ -345,15 +345,15 @@ jobs:
         key: gpu-image
         deps: testing_minimal
        opencl: 'true'
-    - name: Test GPU IMAGE=2 ops + training
+    - name: Test CL IMAGE=2 ops + training
       run: |
-        GPU=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
-        GPU=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
+        CL=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
+        CL=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
     - name: Run process replay tests
       uses: ./.github/actions/process-replay

   testgpumisc:
-    name: 'GPU Misc tests'
+    name: 'CL Misc tests'
     runs-on: ubuntu-22.04
     timeout-minutes: 10
     env:
@@ -368,11 +368,11 @@
         deps: testing_minimal
         opencl: 'true'
     - name: Generate Dataset
-      run: GPU=1 extra/optimization/generate_dataset.sh
+      run: CL=1 extra/optimization/generate_dataset.sh
     - name: Run Kernel Count Test
-      run: GPU=1 python -m pytest -n=auto test/external/external_test_opt.py
+      run: CL=1 python -m pytest -n=auto test/external/external_test_opt.py
     - name: Run fused optimizer tests
-      run: GPU=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
+      run: CL=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
     - name: Upload artifact
       uses: actions/upload-artifact@v4
       with:
@@ -397,17 +397,17 @@
         llvm: 'true'
     - name: Test openpilot model kernel count and gate usage
       run: |
-        ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2175 ALLOWED_GATED_READ_IMAGE=16 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
+        ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2175 ALLOWED_GATED_READ_IMAGE=16 FLOAT16=0 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
     - name: Test openpilot alt model correctness (float32)
-      run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
+      run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
     - name: Test openpilot fastvits model correctness (float32)
-      run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
+      run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
     # - name: Test openpilot simple_plan vision model correctness (float32)
-    #   run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
+    #   run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
     - name: Test openpilot LLVM compile
       run: CPU=1 CPU_LLVM=1 LLVMOPT=1 JIT=2 BEAM=0 IMAGE=0 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
     - name: Test openpilot compile4
-      run: NOLOCALS=1 GPU=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
+      run: NOLOCALS=1 CL=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
     - name: Run process replay tests
       uses: ./.github/actions/process-replay
@@ -459,16 +459,16 @@
         pydeps: "tensorflow==2.15.1 tensorflow_addons"
         python-version: '3.11'
         opencl: 'true'
-    - name: Test ONNX (GPU)
-      run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+    - name: Test ONNX (CL)
+      run: CL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
     #- name: Test Optimization Helpers
     #  run: DEBUG=1 python3 extra/optimization/test_helpers.py
     #- name: Test Action Space
-    #  run: DEBUG=1 GPU=1 python3 extra/optimization/get_action_space.py
+    #  run: DEBUG=1 CL=1 python3 extra/optimization/get_action_space.py
     - name: Test Beam Search
-      run: GPU=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
+      run: CL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
     - name: Test MLPerf stuff
-      run: GPU=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
+      run: CL=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
     - name: Test llama 3 training
       run: MAX_BUFFER_SIZE=0 DEV=NULL SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=8 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
     - name: Run process replay tests
@@ -506,8 +506,8 @@
         llvm: 'true'
     - name: Test models (llvm)
       run: CPU=1 CPU_LLVM=1 python -m pytest -n=auto test/models --durations=20
-    - name: Test models (gpu)
-      run: GPU=1 python -m pytest -n=auto test/models --durations=20
+    - name: Test models (opencl)
+      run: CL=1 python -m pytest -n=auto test/models --durations=20
     - name: Test models (cpu)
       run: CPU=1 CPU_LLVM=0 python -m pytest -n=auto test/models --durations=20
     - name: Run process replay tests
@@ -709,7 +709,7 @@
     strategy:
       fail-fast: false
       matrix:
-        backend: [llvm, cpu, gpu]
+        backend: [llvm, cpu, opencl]

     name: Linux (${{ matrix.backend }})
     runs-on: ubuntu-22.04
@@ -725,13 +725,13 @@
       with:
         key: ${{ matrix.backend }}-minimal
         deps: testing_minimal
-        opencl: ${{ matrix.backend == 'gpu' && 'true' }}
+        opencl: ${{ matrix.backend == 'opencl' && 'true' }}
         llvm: ${{ matrix.backend == 'llvm' && 'true' }}
     - name: Set env
-      run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'gpu' && 'GPU=1' }}" >> $GITHUB_ENV
+      run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'CL=1' }}" >> $GITHUB_ENV
     - name: Check Device.DEFAULT and print some source
       run: |
-        python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','GPU'], Device.DEFAULT"
+        python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','CL'], Device.DEFAULT"
         DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Run pytest (${{ matrix.backend }})
       run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
@@ -772,7 +772,7 @@
         start_server "remote-server-amd-1" "AMD" 6667
         start_server "remote-server-amd-2" "AMD" 6668
-        start_server "remote-server-gpu" "GPU" 7667
+        start_server "remote-server-gpu" "CL" 7667
         start_server "remote-server-cpu" "CPU" 8667
     - name: Check Device.DEFAULT and print some source
       env:
@@ -786,7 +786,7 @@
         HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
       run: |
        python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
-    - name: Run REMOTE=1 Test (GPU)
+    - name: Run REMOTE=1 Test (CL)
       env:
         HOST: 127.0.0.1:7667*6
       run: |
diff --git a/README.md b/README.md
index 262ea97d37..dab378a23a 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers

 tinygrad already supports numerous accelerators, including:

-- [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
+- [x] [OpenCL](tinygrad/runtime/ops_cl.py)
 - [x] [CPU](tinygrad/runtime/ops_cpu.py)
 - [x] [METAL](tinygrad/runtime/ops_metal.py)
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
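For reference, the renamed backend is selected the same way as before, just under the new name. A minimal sketch, assuming an OpenCL-capable device; the `CL=1` and `device="CL"` spellings come from the hunks above and below:

```python
# run as: CL=1 python3 sketch.py  -- forces the OpenCL backend, per docs/env_vars.md below
from tinygrad import Tensor, Device

print(Device.DEFAULT)                     # prints "CL" when the OpenCL backend is the default
t = Tensor([1.0, 2.0, 3.0], device="CL")  # or pin a tensor to the backend explicitly
print((t + 1).numpy())                    # [2. 3. 4.]
```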
diff --git a/docs/env_vars.md b/docs/env_vars.md
index 9367eef064..44be042bfa 100644
--- a/docs/env_vars.md
+++ b/docs/env_vars.md
@@ -3,7 +3,7 @@
 This is a list of environment variable that control the runtime behavior of tinygrad and its examples.
 Most of these are self-explanatory, and are usually used to set an option at runtime.

-Example: `GPU=1 DEBUG=4 python3 -m pytest`
+Example: `CL=1 DEBUG=4 python3 -m pytest`

 However you can also decorate a function to set a value only inside that function.

@@ -31,7 +31,7 @@ These control the behavior of core tinygrad even when used as a library.
 Variable | Possible Value(s) | Description
 ---|---|---
 DEBUG | [1-7] | enable debugging output (operations, timings, speed, generated code and more)
-GPU | [1] | enable the GPU (OpenCL) backend
+CL | [1] | enable OpenCL backend
 CUDA | [1] | enable CUDA backend
 AMD | [1] | enable AMD backend
 NV | [1] | enable NV backend
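The "decorate a function" mechanism referenced in the hunk above is `tinygrad.helpers.Context`. A hedged sketch of scoping a variable to one block or function instead of setting it process-wide:

```python
# Context scopes a variable (e.g. DEBUG) the same way the environment variable would,
# but only inside the `with` block or the decorated function
from tinygrad import Tensor
from tinygrad.helpers import Context

with Context(DEBUG=4):                    # like DEBUG=4, limited to this block
  (Tensor.ones(4) + 1).realize()

@Context(DEBUG=2)                         # Context also works as a decorator
def noisy_step():
  return (Tensor.ones(4) * 2).realize()
```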
diff --git a/docs/runtime.md b/docs/runtime.md
index bc85d9bedf..656c6d75c3 100644
--- a/docs/runtime.md
+++ b/docs/runtime.md
@@ -9,7 +9,7 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra
 | [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | 6xx series GPUs |
 | [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | M1+ Macs; Metal 3.0+ for `bfloat` support |
 | [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | NVIDIA GPU with CUDA support |
-| [GPU (OpenCL)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_gpu.py) | Accelerates computations using OpenCL on GPUs | OpenCL 2.0 compatible device |
+| [OpenCL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cl.py) | Accelerates computations using OpenCL on GPUs | OpenCL 2.0 compatible device |
 | [CPU (C Code)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang compiler | `clang` compiler in system `PATH` |
 | [LLVM (LLVM IR)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_llvm.py) | Runs on CPU using the LLVM compiler infrastructure | llvm libraries installed and findable |
 | [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | Dawn library installed and findable. Download binaries [here](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0). |
diff --git a/examples/openpilot/compile4.py b/examples/openpilot/compile4.py
index c57bd3eb70..55fcccbfbf 100644
--- a/examples/openpilot/compile4.py
+++ b/examples/openpilot/compile4.py
@@ -6,7 +6,7 @@ from tinygrad.schedule.kernelize import get_kernelize_map
 from tinygrad.engine.schedule import create_schedule_with_vars
 from tinygrad.engine.realize import run_schedule

-# NOLOCALS=1 GPU=1 IMAGE=2 FLOAT16=1 VIZ=1 DEBUG=2 python3 examples/openpilot/compile4.py
+# NOLOCALS=1 CL=1 IMAGE=2 FLOAT16=1 VIZ=1 DEBUG=2 python3 examples/openpilot/compile4.py

 OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx"
 OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "/tmp/openpilot.pkl"
diff --git a/extra/archprobe.py b/extra/archprobe.py
index 73eb5037fb..7ba20b2a88 100644
--- a/extra/archprobe.py
+++ b/extra/archprobe.py
@@ -1,7 +1,7 @@
 # copying the kernels from https://github.com/microsoft/ArchProbe into Python
 import numpy as np
 import pickle
-from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer
+from tinygrad.runtime.ops_cl import CLProgram, CLBuffer
 from tinygrad import dtypes
 from tqdm import trange, tqdm
 from matplotlib import pyplot as plt
diff --git a/extra/assembly/assembly_rdna.py b/extra/assembly/assembly_rdna.py
index 0f5ab01ecf..297639d676 100644
--- a/extra/assembly/assembly_rdna.py
+++ b/extra/assembly/assembly_rdna.py
@@ -4,7 +4,7 @@ from tinygrad import dtypes
 from tinygrad.codegen.assembly import AssemblyCodegen, Register
 from tinygrad.codegen.opt.kernel import Ops
 from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
-from tinygrad.runtime.ops_gpu import ROCM_LLVM_PATH
+from tinygrad.runtime.ops_cl import ROCM_LLVM_PATH

 # ugh, is this really needed?
 from extra.helpers import enable_early_exec
diff --git a/extra/assembly/rocm/rdna3/asm.py b/extra/assembly/rocm/rdna3/asm.py
index 2f6ad13264..9c65fa7360 100644
--- a/extra/assembly/rocm/rdna3/asm.py
+++ b/extra/assembly/rocm/rdna3/asm.py
@@ -5,7 +5,7 @@ from tinygrad.helpers import colored
 from extra.helpers import enable_early_exec
 early_exec = enable_early_exec()

-from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer, ROCM_LLVM_PATH
+from tinygrad.runtime.ops_cl import CLProgram, CLBuffer, ROCM_LLVM_PATH

 ENABLE_NON_ASM = False
diff --git a/extra/export_model.py b/extra/export_model.py
index 65a8b3af9f..e29f8a8d31 100644
--- a/extra/export_model.py
+++ b/extra/export_model.py
@@ -10,7 +10,7 @@ from tinygrad.uop.ops import Ops
 import json
 from collections import OrderedDict

-EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "GPU"]
+EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "CL"]

 def compile_net(run:TinyJit, special_names:Dict[int,str]) -> Tuple[Dict[str,str],List[Tuple[str,List[str],List[int]]],Dict[str,Tuple[int,DType,int]],Dict[str,Tensor]]:
   functions, bufs, bufs_to_save, statements, bufnum = {}, {}, {}, [], 0
diff --git a/extra/gemm/intel_xmx.py b/extra/gemm/intel_xmx.py
index 8ec478e5f6..719830473a 100644
--- a/extra/gemm/intel_xmx.py
+++ b/extra/gemm/intel_xmx.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 import numpy as np
-from tinygrad.runtime.ops_gpu import CLProgram, CLCompiler
+from tinygrad.runtime.ops_cl import CLProgram, CLCompiler
 from tinygrad import Device, dtypes
 from tinygrad.device import Buffer
 from hexdump import hexdump
@@ -11,7 +11,7 @@ from hexdump import hexdump
 # https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_split_matrix_multiply_accumulate.html
 # https://hc34.hotchips.org/assets/program/conference/day1/GPU%20HPC/Intel_s%20Ponte%20Vecchio%20GPU%20-%20Architecture%20Systems%20and%20Software%20FINAL.pdf

-device = Device["GPU"]
+device = Device["CL"]

 # NOTE: only the subgroup type 8 ones work
 prog = CLProgram(device, "test", CLCompiler(device, "test").compile(f"""
@@ -26,9 +26,9 @@ __kernel void test(__global float* data0, const __global int* data1, const __glo
 """))
 #with open("/tmp/test.elf", "wb") as f: f.write(prog.lib)

-a = Buffer("GPU", 8, dtypes.float32).allocate()
-b = Buffer("GPU", 0x10, dtypes.float16).allocate()
-c = Buffer("GPU", 8*0x10, dtypes.float16).allocate()
+a = Buffer("CL", 8, dtypes.float32).allocate()
+b = Buffer("CL", 0x10, dtypes.float16).allocate()
+c = Buffer("CL", 8*0x10, dtypes.float16).allocate()

 row = np.array([1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8], np.float16)
 mat = np.random.random((8, 0x10)).astype(np.float16)
diff --git a/extra/optimization/generate_dataset.sh b/extra/optimization/generate_dataset.sh
index 6f70916979..b843dac700 100755
--- a/extra/optimization/generate_dataset.sh
+++ b/extra/optimization/generate_dataset.sh
@@ -7,7 +7,7 @@ rm $LOGOPS
 test/external/process_replay/reset.py
 CI=1 python3 -m pytest -n=auto test/test_ops.py test/test_nn.py test/test_winograd.py test/models/test_real_world.py --durations=20
-GPU=1 python3 -m pytest test/test_tiny.py
+CL=1 python3 -m pytest test/test_tiny.py

 # extract, sort and uniq
 extra/optimization/extract_dataset.py
diff --git a/extra/qcom_gpu_driver/qcom_opencl_interop.py b/extra/qcom_gpu_driver/qcom_opencl_interop.py
index d595ba343f..c2e0741ca2 100644
--- a/extra/qcom_gpu_driver/qcom_opencl_interop.py
+++ b/extra/qcom_gpu_driver/qcom_opencl_interop.py
@@ -1,6 +1,6 @@
 import ctypes, array
 from hexdump import hexdump
-from tinygrad.runtime.ops_gpu import GPUDevice
+from tinygrad.runtime.ops_cl import CLDevice
 from tinygrad.helpers import getenv, to_mv, mv_address
 from tinygrad.dtype import dtypes
 from tinygrad import Tensor, TinyJit
@@ -8,7 +8,7 @@ from tinygrad.runtime.autogen import opencl as cl
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl  # noqa: F401 # pylint: disable=unused-import

 # create raw opencl buffer.
-gdev = GPUDevice()
+gdev = CLDevice()
 cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, 0x100, None, status := ctypes.c_int32())
 assert status.value == 0
diff --git a/extra/thneed.py b/extra/thneed.py
index c59f636858..ca89bfa603 100644
--- a/extra/thneed.py
+++ b/extra/thneed.py
@@ -4,13 +4,13 @@ import struct
 import json
 import traceback
 import numpy as np
-from tinygrad.runtime.ops_gpu import CLProgram, compile_gpu
+from tinygrad.runtime.ops_cl import CLProgram, compile_gpu
 from tinygrad.device import Device
 from tinygrad.helpers import DEBUG, getenv
 from collections import defaultdict
 import pyopencl as cl
-from tinygrad.runtime.ops_gpu import OSX_TIMING_RATIO
-CL = Device["GPU"]
+from tinygrad.runtime.ops_cl import OSX_TIMING_RATIO
+CL = Device["CL"]

 DEBUGCL = getenv("DEBUGCL", 0)
 FLOAT16 = getenv("FLOAT16", 0)
@@ -110,7 +110,7 @@ class Thneed:
     prgs = {}
     for o in jdat['binaries']:
       nptr = ptr + o['length']
-      prgs[o['name']] = CLProgram(Device["GPU"], o['name'], weights[ptr:nptr])
+      prgs[o['name']] = CLProgram(Device["CL"], o['name'], weights[ptr:nptr])
       ptr = nptr

     # populate the cl_cache
@@ -267,7 +267,7 @@ class Thneed:
     for prg, args in self.cl_cache:
       events.append(prg.clprg(CL.queue, *args))
     mt = time.monotonic()
-    Device["GPU"].synchronize()
+    Device["CL"].synchronize()
     et = time.monotonic() - st
     print(f"submit in {(mt-st)*1000.0:.2f} ms, total runtime is {et*1000.0:.2f} ms")
diff --git a/test/device/test_ocl.py b/test/device/test_ocl.py
index 04b8e2523e..6f58b909db 100644
--- a/test/device/test_ocl.py
+++ b/test/device/test_ocl.py
@@ -3,9 +3,9 @@ from tinygrad import Device
 from tinygrad.device import Buffer
 from tinygrad.dtype import dtypes
 from tinygrad.helpers import CI
-from tinygrad.runtime.ops_gpu import CLDevice, CLAllocator, CLCompiler, CLProgram
+from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Runs only on OpenCL (GPU)")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
 class TestCLError(unittest.TestCase):
   @unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
   def test_oom(self):
@@ -24,7 +24,7 @@ class TestCLError(unittest.TestCase):
   def test_unaligned_copy(self):
     data = list(range(65))
     unaligned = memoryview(bytearray(data))[1:]
-    buffer = Buffer("GPU", 64, dtypes.uint8).allocate()
+    buffer = Buffer("CL", 64, dtypes.uint8).allocate()
     buffer.copyin(unaligned)
     result = memoryview(bytearray(len(data) - 1))
     buffer.copyout(result)
diff --git a/test/external/external_benchmark_hip_compile.py b/test/external/external_benchmark_hip_compile.py
index 2b1d480348..d97047b923 100644
--- a/test/external/external_benchmark_hip_compile.py
+++ b/test/external/external_benchmark_hip_compile.py
@@ -1,7 +1,7 @@
 import random, os
 from tinygrad.helpers import Timing
 from tinygrad.runtime.ops_hip import compile_hip, HIPDevice
-from tinygrad.runtime.ops_gpu import compile_cl, CLDevice
+from tinygrad.runtime.ops_cl import compile_cl, CLDevice

 # OMP_NUM_THREADS=1 strace -tt -f -e trace=file python3 test/external/external_benchmark_hip_compile.py
 # AMD_COMGR_REDIRECT_LOGS=stdout AMD_COMGR_EMIT_VERBOSE_LOGS=1 python3 test/external/external_benchmark_hip_compile.py
diff --git a/test/external/external_cl_half_max.py b/test/external/external_cl_half_max.py
index 7cd6b0c509..020d806c63 100644
--- a/test/external/external_cl_half_max.py
+++ b/test/external/external_cl_half_max.py
@@ -1,4 +1,4 @@
-from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, compile_cl
+from tinygrad.runtime.ops_cl import CLDevice, CLProgram, compile_cl

 if __name__ == "__main__":
   dev = CLDevice()
diff --git a/test/external/external_gpu_fail_osx.py b/test/external/external_gpu_fail_osx.py
index b11b695e3d..51a458b136 100644
--- a/test/external/external_gpu_fail_osx.py
+++ b/test/external/external_gpu_fail_osx.py
@@ -1,5 +1,5 @@
 # ugh, OS X OpenCL doesn't support half
-from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, CLCompiler
+from tinygrad.runtime.ops_cl import CLDevice, CLProgram, CLCompiler

 src = """#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void max_half(__global half* data0, const __global half* data1) {
diff --git a/test/external/external_multi_gpu.py b/test/external/external_multi_gpu.py
index 32d107df7d..b3c8fefb30 100644
--- a/test/external/external_multi_gpu.py
+++ b/test/external/external_multi_gpu.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # cd extra/disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
-# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
+# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so CL=1 python3 test/external/external_multi_gpu.py
 import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import colored, Timing, getenv
diff --git a/test/external/external_osx_profiling.py b/test/external/external_osx_profiling.py
index 8ac1e584df..277ca304ad 100644
--- a/test/external/external_osx_profiling.py
+++ b/test/external/external_osx_profiling.py
@@ -1,4 +1,4 @@
-from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
+from tinygrad.runtime.ops_cl import CLProgram, CL, CLBuffer
 from tinygrad import dtypes
 import time
diff --git a/test/external/external_test_hcq_fuzz_failures.py b/test/external/external_test_hcq_fuzz_failures.py
index c8b6198e9a..4a381d5336 100644
--- a/test/external/external_test_hcq_fuzz_failures.py
+++ b/test/external/external_test_hcq_fuzz_failures.py
@@ -55,7 +55,7 @@ class TestHCQFuzzFailures(unittest.TestCase):
     ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=0, mask=None, contiguous=True),)), src=()), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 2, 4)), arg=1, src=()), x39:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=0, mask=((0, 1), (0, 6)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), x39,)),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 2, 4)), arg=3, src=()), x46:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-6, mask=((0, 1), (6, 12)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), x46,)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=5, src=()), x54:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (12, 13)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=6, src=()), x54,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=7, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-13, mask=((0, 1), (13, 17)), contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=8, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-17, mask=((0, 1), (17, 21)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=9, src=()), x68:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (21, 22)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=10, src=()), x68,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=11, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-22, mask=((0, 1), (22, 26)), contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=12, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-26, mask=((0, 1), (26, 30)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=13, src=()), x82:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (30, 31)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=14, src=()), x82,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=15, src=()), x90:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (31, 32)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=16, src=()), x90,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=17, src=()), x98:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (32, 33)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=18, src=()), x98,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=19, src=()), x106:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (33, 34)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=20, src=()), x106,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=21, src=()), x114:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (34, 35)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=22, src=()), x114,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=23, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-35, mask=((0, 1), (35, 39)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=24, src=()), x125:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (39, 40)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=25, src=()), x125,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=26, src=()), x133:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (40, 41)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=27, src=()), x133,)),)),)),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 2, 4)), arg=28, src=()), x140:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-41, mask=((0, 1), (41, 47)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=29, src=()), x140,)),)),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 2, 4)), arg=30, src=()), x147:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-47, mask=((0, 1), (47, 53)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=31, src=()), x147,)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=32, src=()), x155:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (53, 54)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=33, src=()), x155,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=34, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-54, mask=((0, 1), (54, 58)), contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=35, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-58, mask=((0, 1), (58, 62)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=36, src=()), x169:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (62, 63)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=37, src=()), x169,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=38, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-63, mask=((0, 1), (63, 67)), contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=39, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-67, mask=((0, 1), (67, 71)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=40, src=()), x183:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (71, 72)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=41, src=()), x183,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=42, src=()), x191:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (72, 73)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=43, src=()), x191,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=44, src=()), x199:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (73, 74)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=45, src=()), x199,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=46, src=()), x207:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (74, 75)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=47, src=()), x207,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=48, src=()), x215:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (75, 76)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=49, src=()), x215,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=50, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-76, mask=((0, 1), (76, 80)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=51, src=()), x226:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (80, 81)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=52, src=()), x226,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=53, src=()), x234:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (81, 82)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=54, src=()), x234,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=55, src=()), x243:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (82, 83)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=56, src=()), x243,)),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=57, src=()), x250:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (83, 84)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=58, src=()), x250,)),)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 128, 4)), arg=59, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-84, mask=((0, 1), (84, 596)), contiguous=False),)), src=()),)),)),)),)),)) # noqa: E501
     opts = [Opt(op=OptOps.UPCAST, axis=0, arg=4)]
-    helper_test_lin(Kernel(ast), opts, failed_platforms=[], validate_device=Device["GPU"])
+    helper_test_lin(Kernel(ast), opts, failed_platforms=[], validate_device=Device["CL"])

 if __name__ == '__main__':
   unittest.main()
diff --git a/test/external/external_test_image.py b/test/external/external_test_image.py
index 1c2cc397f2..4f2e06d903 100644
--- a/test/external/external_test_image.py
+++ b/test/external/external_test_image.py
@@ -4,7 +4,7 @@ import unittest
 import numpy as np

 if 'IMAGE' not in os.environ: os.environ['IMAGE'] = '2'
-os.environ['GPU'] = '1'
+os.environ['CL'] = '1'
 os.environ['OPT'] = '2'
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d
diff --git a/test/external/external_test_onnx_backend.py b/test/external/external_test_onnx_backend.py
index d48154b85d..112ccd797c 100644
--- a/test/external/external_test_onnx_backend.py
+++ b/test/external/external_test_onnx_backend.py
@@ -193,12 +193,12 @@ backend_test.exclude('test_adam_cpu')
 backend_test.exclude('test_gradient_of_add_and_mul_cpu')
 backend_test.exclude('test_gradient_of_add_cpu')

-if Device.DEFAULT in ['GPU', 'METAL']:
+if Device.DEFAULT in ['CL', 'METAL']:
   backend_test.exclude('test_resize_upsample_sizes_nearest_axes_2_3_cpu')
   backend_test.exclude('test_resize_upsample_sizes_nearest_axes_3_2_cpu')
   backend_test.exclude('test_resize_upsample_sizes_nearest_cpu')

-if Device.DEFAULT == "METAL" or (OSX and Device.DEFAULT == "GPU"):
+if Device.DEFAULT == "METAL" or (OSX and Device.DEFAULT == "CL"):
   # numerical inaccuracy
   backend_test.exclude('test_mish_cpu')
   backend_test.exclude('test_mish_expanded_cpu')
diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py
index 584c3af89f..f87206be2b 100644
--- a/test/external/external_test_opt.py
+++ b/test/external/external_test_opt.py
@@ -34,7 +34,7 @@ from extra.models.efficientnet import EfficientNet
 from extra.models.resnet import ResNet18
 from extra.models.vit import ViT

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestInferenceMinKernels(unittest.TestCase):
   def setUp(self):
     self.training_old = Tensor.training
@@ -90,7 +90,7 @@ class TestInferenceMinKernels(unittest.TestCase):
     with CLCache(100):
       model(inp, 0).realize()

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOptBinOp(unittest.TestCase):
   def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
     a = Tensor.randn(16, 16)
@@ -117,7 +117,7 @@ class TestOptBinOp(unittest.TestCase):

   #def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
   #def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOptReduceLoop(unittest.TestCase):
   def test_loop_left(self):
     a = Tensor.randn(16, 16)
@@ -139,7 +139,7 @@ class TestOptReduceLoop(unittest.TestCase):
       c.realize()
     assert cache.count == 2, "loop right fusion broken"

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOptWChild(unittest.TestCase):
   @unittest.skip("this no longer happens, use realize")
   def test_unrealized_child(self):
@@ -152,7 +152,7 @@ class TestOptWChild(unittest.TestCase):
       d.realize()
     assert cache.count == 2, "don't fuse if you have children"

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOpt(unittest.TestCase):
   def test_muladd(self):
     a,b,c = [Tensor.randn(2,2).realize() for _ in range(3)]
diff --git a/test/external/fuzz_linearizer.py b/test/external/fuzz_linearizer.py
index e5242ac605..19ed23f5d6 100644
--- a/test/external/fuzz_linearizer.py
+++ b/test/external/fuzz_linearizer.py
@@ -16,7 +16,7 @@ if os.getenv("VALIDATE_HCQ", 0) != 0:
   try:
     import extra.qcom_gpu_driver.opencl_ioctl
     from tinygrad import Device
-    _, _ = Device["QCOM"], Device["GPU"]
+    _, _ = Device["QCOM"], Device["CL"]
   except Exception: pass

 from tinygrad import Tensor, Device, dtypes
@@ -42,9 +42,9 @@ if getenv("VALIDATE_HCQ"):
     on_linearizer_did_run = extra.nv_gpu_driver.nv_ioctl.collect_last_launch_state
     compare_states = extra.nv_gpu_driver.nv_ioctl.compare_launch_state
   elif Device.DEFAULT == "QCOM":
-    print("VALIDATE_HCQ: Comparing QCOM to GPU")
+    print("VALIDATE_HCQ: Comparing QCOM to CL")
     import extra.qcom_gpu_driver.opencl_ioctl
-    validate_device = Device["GPU"]
+    validate_device = Device["CL"]
     on_linearizer_will_run = extra.qcom_gpu_driver.opencl_ioctl.before_launch
     on_linearizer_did_run = extra.qcom_gpu_driver.opencl_ioctl.collect_last_launch_state
     compare_states = extra.qcom_gpu_driver.opencl_ioctl.compare_launch_state
@@ -302,7 +302,7 @@ if __name__ == "__main__":
   for i, ast in enumerate(ast_strs[:getenv("FUZZ_N", len(ast_strs))]):
     if (nth := getenv("FUZZ_NTH", -1)) != -1 and i != nth: continue
     if getenv("FUZZ_IMAGEONLY") and "dtypes.image" not in ast: continue
-    if "dtypes.image" in ast and Device.DEFAULT not in {"GPU", "QCOM"}: continue  # IMAGE is only for GPU
+    if "dtypes.image" in ast and Device.DEFAULT not in {"CL", "QCOM"}: continue  # IMAGE is only for CL
     if ast in seen_ast_strs: continue
     seen_ast_strs.add(ast)
diff --git a/test/helpers.py b/test/helpers.py
index 4833f425d9..cee64595f3 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -57,8 +57,8 @@ def eval_uop(uop:UOp, inputs:list[tuple[DType, list[Any]]]|None=None):
   return out_buf.cast(uop.dtype.fmt).tolist()[0]

 def not_support_multi_device():
-  # GPU and CUDA don't support multi device if in CI
-  return CI and REAL_DEV in ("GPU", "CUDA")
+  # CL and CUDA don't support multi device if in CI
+  return CI and REAL_DEV in ("CL", "CUDA")

 # NOTE: This will open REMOTE if it's the default device
 REAL_DEV = (Device.DEFAULT if Device.DEFAULT != "REMOTE" else Device['REMOTE'].properties.real_device)
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index c55aee0ad8..26e0ee760d 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -114,7 +114,7 @@ class TestRealWorld(unittest.TestCase):

     helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.07, 93)

-  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "GPU"}, "slow")
+  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
   def test_train_cifar(self):
     with Tensor.train():
       model = SpeedyResNet(Tensor.ones((12,3,2,2)))
diff --git a/test/models/test_train.py b/test/models/test_train.py
index 605e6f6de1..972c491923 100644
--- a/test/models/test_train.py
+++ b/test/models/test_train.py
@@ -27,7 +27,7 @@ def train_one_step(model,X,Y):
   print("done in %.2f ms" % (et*1000.))

 def check_gc():
-  if Device.DEFAULT == "GPU":
+  if Device.DEFAULT == "CL":
     from extra.introspection import print_objects
     assert print_objects() == 0
diff --git a/test/opt/test_kernel_opts.py b/test/opt/test_kernel_opts.py
index 25951a01af..c0c8865146 100644
--- a/test/opt/test_kernel_opts.py
+++ b/test/opt/test_kernel_opts.py
@@ -93,7 +93,7 @@ class TestKernelOpts(unittest.TestCase):
     a = Tensor.rand(8, N, 8, N)
     r = a.sum(axis=(1,3))
     helper_linearizer_opt(r, [
-      # openCL / GPU=1 is 256 max threads
+      # openCL / CL=1 is 256 max threads
       [Opt(OptOps.GROUPTOP, 0, 2)], [Opt(OptOps.GROUPTOP, 0, 32)],
       [Opt(OptOps.GROUPTOP, 1, 2)], [Opt(OptOps.GROUPTOP, 1, 32)],
       # Checking how it works with 1 grouped_reduce.
       [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)],
diff --git a/test/speed/external_test_copy_speed.py b/test/speed/external_test_copy_speed.py
index 351c7d993a..359a2499ac 100644
--- a/test/speed/external_test_copy_speed.py
+++ b/test/speed/external_test_copy_speed.py
@@ -77,9 +77,9 @@ class TestCopySpeed(unittest.TestCase):
       np.testing.assert_equal(t.numpy(), x.numpy())

   @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
-  @unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU")
+  @unittest.skipIf(Device.DEFAULT != "CL", "only test this on CL")
   def testCopyCPUto6GPUs(self):
-    from tinygrad.runtime.ops_gpu import CLDevice
+    from tinygrad.runtime.ops_cl import CLDevice
     if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
     t = Tensor.ones(N, N, device="CPU").contiguous().realize()
     print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
@@ -87,8 +87,8 @@ class TestCopySpeed(unittest.TestCase):
     with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"):
       with Timing("queue: "):
         for g in range(6):
-          t.to(f"gpu:{g}").realize()
-        Device["gpu"].synchronize()
+          t.to(f"CL:{g}").realize()
+        Device["CL"].synchronize()

 if __name__ == '__main__':
   unittest.main()
diff --git a/test/test_dtype.py b/test/test_dtype.py
index 3f007783a1..31e4472bf5 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -424,7 +424,7 @@ class TestDtypeUsage(unittest.TestCase):
 class TestOpsBFloat16(unittest.TestCase):
   def test_cast(self):
     # TODO: helper_test_op breaks in unrelated part
-    # TODO: wrong output with GPU=1 on mac
+    # TODO: wrong output with CL=1 on mac
     data = [60000.0, 70000.0, 80000.0]
     np.testing.assert_allclose(Tensor(data).cast("bfloat16").numpy(), torch.tensor(data).type(torch.bfloat16).float().numpy())
diff --git a/test/test_image_dtype.py b/test/test_image_dtype.py
index 08d2c04c32..6adab73e51 100644
--- a/test/test_image_dtype.py
+++ b/test/test_image_dtype.py
@@ -7,7 +7,7 @@ from tinygrad.engine.realize import lower_schedule
 from tinygrad.helpers import prod, unwrap
 from test.helpers import REAL_DEV

-IMAGE_SUPPORTED_DEVICES = ("QCOM", "GPU")
+IMAGE_SUPPORTED_DEVICES = ("QCOM", "CL")

 @unittest.skipUnless(REAL_DEV in IMAGE_SUPPORTED_DEVICES, "Images not supported")
 class TestImageCopy(unittest.TestCase):
diff --git a/test/test_ops.py b/test/test_ops.py
index eb22cf21bb..dc3952d519 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1304,7 +1304,7 @@ class TestOps(unittest.TestCase):
                                                            np.arange(64,128,dtype=np.float32).reshape(8,8)])
   def test_small_gemm_eye(self):
     helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
-  @unittest.skipIf(CI and Device.DEFAULT in ["NV", "GPU", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
+  @unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
                    or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
   def test_gemm_fp16(self):
     helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3)
diff --git a/test/test_opts.py b/test/test_opts.py
index 2c0de53199..4a6310ef32 100644
--- a/test/test_opts.py
+++ b/test/test_opts.py
@@ -13,7 +13,7 @@ class TestOpts(unittest.TestCase):
     out = (a+b).contiguous(arg=opts)
     s = out.schedule()
     self.assertEqual(s[-1].ast.arg.opts_to_apply, opts)
-    if Device.DEFAULT in {"CPU", "GPU", "METAL"} and not CPU_LLVM:
+    if Device.DEFAULT in {"CPU", "CL", "METAL"} and not CPU_LLVM:
       prg = get_program(s[-1].ast)
       self.assertIn('float4', prg.src)
diff --git a/test/test_schedule.py b/test/test_schedule.py
index a895ed1a0d..9e00835460 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -1654,7 +1654,7 @@ class TestSchedule(unittest.TestCase):
     constv = Tensor.empty(2, 2).uop.const_like(10).contiguous()
     check_schedule(constv, 1)

-  @unittest.skipIf(Device.DEFAULT != "GPU", "image only supported on GPU")
+  @unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
   def test_image_matmul(self):
     with Context(IMAGE=2):
       x = Tensor.randn((9, 9)).realize()
diff --git a/test/test_tiny.py b/test/test_tiny.py
index bf0f903ff2..78d1517522 100644
--- a/test/test_tiny.py
+++ b/test/test_tiny.py
@@ -137,7 +137,7 @@ class TestTiny(unittest.TestCase):

   # *** image ***

-  @unittest.skipIf(Device.DEFAULT != "GPU", "image only supported on GPU")
+  @unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
   def test_image(self):
     with Context(IMAGE=2):
       self.test_gemm(N=4, out_dtype=dtypes.imagef((4, 1, 4)))
diff --git a/test/test_uops.py b/test/test_uops.py
index 64b9abe2de..4a7ed30880 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -513,7 +513,7 @@ class TestUOpStr(unittest.TestCase):
     assert str(eval(str(vec))) == str(vec)

   def test_device_arg(self):
-    device = UOp(Ops.DEVICE, arg="GPU")
+    device = UOp(Ops.DEVICE, arg="CL")
     assert str(eval(str(device))) == str(device)

   def test_reduceop_arg(self):
diff --git a/test/unit/test_device.py b/test/unit/test_device.py
index 1c1b9f7997..8ab43a17f0 100644
--- a/test/unit/test_device.py
+++ b/test/unit/test_device.py
@@ -9,12 +9,12 @@ class TestDevice(unittest.TestCase):
     self.assertEqual(Device.canonicalize(None), Device.DEFAULT)
     self.assertEqual(Device.canonicalize("CPU"), "CPU")
     self.assertEqual(Device.canonicalize("cpu"), "CPU")
-    self.assertEqual(Device.canonicalize("GPU"), "GPU")
-    self.assertEqual(Device.canonicalize("GPU:0"), "GPU")
-    self.assertEqual(Device.canonicalize("gpu:0"), "GPU")
-    self.assertEqual(Device.canonicalize("GPU:1"), "GPU:1")
-    self.assertEqual(Device.canonicalize("gpu:1"), "GPU:1")
-    self.assertEqual(Device.canonicalize("GPU:2"), "GPU:2")
+    self.assertEqual(Device.canonicalize("CL"), "CL")
+    self.assertEqual(Device.canonicalize("CL:0"), "CL")
+    self.assertEqual(Device.canonicalize("cl:0"), "CL")
+    self.assertEqual(Device.canonicalize("CL:1"), "CL:1")
+    self.assertEqual(Device.canonicalize("cl:1"), "CL:1")
+    self.assertEqual(Device.canonicalize("CL:2"), "CL:2")
     self.assertEqual(Device.canonicalize("disk:/dev/shm/test"), "DISK:/dev/shm/test")
     self.assertEqual(Device.canonicalize("disk:000.txt"), "DISK:000.txt")
diff --git a/test/unit/test_indexing.py b/test/unit/test_indexing.py
index da5d61944c..c9d6d7c7da 100644
--- a/test/unit/test_indexing.py
+++ b/test/unit/test_indexing.py
@@ -181,7 +181,7 @@ class TestIndexing(unittest.TestCase):
     # self.assertRaises(TypeError, delitem)

   # TODO: LLVM is quite fast, why are other compiled backends slow?
-  @unittest.skipIf(CI and Device.DEFAULT in ["CPU", "GPU", "METAL", "NV", "AMD"], "slow")
+  @unittest.skipIf(CI and Device.DEFAULT in ["CPU", "CL", "METAL", "NV", "AMD"], "slow")
   def test_advancedindex(self):
     # integer array indexing
diff --git a/test/unit/test_simplify_valid_idx.py b/test/unit/test_simplify_valid_idx.py
index d75c872209..359f7d108f 100644
--- a/test/unit/test_simplify_valid_idx.py
+++ b/test/unit/test_simplify_valid_idx.py
@@ -359,7 +359,7 @@ class TestImageSimplification(unittest.TestCase):
     self.check(load, None, "((gidx*3)+-1438)", "0")

   def test_simplify2(self):
-    # from GPU=1 DEBUG=4 FORWARD_ONLY=1 IMAGE=2 python3 test/test_ops.py TestOps.test_simple_padding_conv2d
+    # from CL=1 DEBUG=4 FORWARD_ONLY=1 IMAGE=2 python3 test/test_ops.py TestOps.test_simple_padding_conv2d
     lidx = Special("lidx", 4)
     valid = (lidx<3) & (lidx<1).ne(True)
     idx = ((lidx+1)%2, (lidx+1)//2-1)
diff --git a/tinygrad/device.py b/tinygrad/device.py
index 483dc9fe35..a69c5316d7 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -11,7 +11,7 @@ from tinygrad.renderer import Renderer

 # **************** Device ****************

-ALL_DEVICES = ["METAL", "AMD", "NV", "CUDA", "QCOM", "GPU", "CPU", "DSP", "WEBGPU"]
+ALL_DEVICES = ["METAL", "AMD", "NV", "CUDA", "QCOM", "CL", "CPU", "DSP", "WEBGPU"]
 class _Device:
   def __init__(self) -> None:
     self._devices = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
@@ -336,11 +336,11 @@ def is_dtype_supported(dtype:DType, device:str|None=None) -> bool:
   # CI CUDA architecture is sm_35 but we need at least sm_70 to run fp16 ALUs
   # PYTHON supports half memoryview in 3.12+ https://github.com/python/cpython/issues/90751
   if dtype == dtypes.half:
-    if device == "GPU": return not CI and not OSX
+    if device == "CL": return not CI and not OSX
     if device in ["CUDA", "NV"]: return not CI
     if device == "CPU" and CPU_LLVM: return OSX
     if device == "PYTHON": return sys.version_info >= (3, 12)
-  if dtype == dtypes.float64: return device != "METAL" and not (OSX and device == "GPU")
+  if dtype == dtypes.float64: return device != "METAL" and not (OSX and device == "CL")
   return True

 if PROFILE:
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index b268ce7216..20e3b8ec02 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -242,7 +242,7 @@ class ClangRenderer(CStyleLanguage):
     return defines + "\n" + self._render_body(function_name, kernel, bufs, uops, prefix) + "\n" + self._render_entry(function_name, bufs)

 class OpenCLRenderer(CStyleLanguage):
-  device = "GPU"
+  device = "CL"

   # language options
   kernel_typedef = "__kernel void"
@@ -271,7 +271,7 @@ class OpenCLRenderer(CStyleLanguage):
     return super().render_kernel(function_name, kernel, bufs, uops, prefix)

 class IntelRenderer(OpenCLRenderer):
-  device, suffix, kernel_typedef = "GPU", "INTEL", "__attribute__((intel_reqd_sub_group_size(8)))\n" + "__kernel void"
+  device, suffix, kernel_typedef = "CL", "INTEL", "__attribute__((intel_reqd_sub_group_size(8)))\n" + "__kernel void"
   tensor_cores = tc.intel

   string_rewrite = PatternMatcher([
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_cl.py
similarity index 100%
rename from tinygrad/runtime/ops_gpu.py
rename to tinygrad/runtime/ops_cl.py
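Because the rename is 100% similar, only the module path changes for downstream imports; a hedged before/after sketch (class names unchanged, as seen throughout the hunks above):

```python
# before: from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, CLCompiler
# after:
from tinygrad.runtime.ops_cl import CLDevice, CLProgram, CLCompiler

dev = CLDevice()  # first OpenCL device, same constructor as before
```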
diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py
index f3e7dc899f..787d349ee9 100644
--- a/tinygrad/runtime/ops_qcom.py
+++ b/tinygrad/runtime/ops_qcom.py
@@ -7,7 +7,7 @@ from tinygrad.device import BufferSpec
 from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
 from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
 from tinygrad.runtime.autogen import kgsl, adreno
-from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
+from tinygrad.runtime.ops_cl import CLCompiler, CLDevice
 from tinygrad.renderer.cstyle import QCOMRenderer
 from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl  # noqa: F401 # pylint: disable=unused-import
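Taken together, the user-visible change reduces to three renames; a summary sketch (equivalences taken from the hunks above, the snippet itself is illustrative):

```python
# GPU=1                     ->  CL=1                      (env var selecting the OpenCL backend)
# "GPU" / "gpu:1"           ->  "CL" / "cl:1"             (device strings; see test/unit/test_device.py)
# tinygrad.runtime.ops_gpu  ->  tinygrad.runtime.ops_cl   (module path; GPUDevice -> CLDevice)
from tinygrad import Tensor

x = Tensor.ones(2, 2, device="CL")  # was device="GPU"
print(x.device)                     # "CL"
```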