diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a4c707c2db..6d3f329f91 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -330,8 +330,8 @@ jobs:
     - name: Fuzz Test shape ops
       run: python test/external/fuzz_shape_ops.py

-  testgpuimage:
-    name: 'GPU IMAGE Tests'
+  testopenclimage:
+    name: 'CL IMAGE Tests'
     runs-on: ubuntu-22.04
     timeout-minutes: 10
     env:
@@ -345,15 +345,15 @@ jobs:
         key: gpu-image
         deps: testing_minimal
        opencl: 'true'
-    - name: Test GPU IMAGE=2 ops + training
+    - name: Test CL IMAGE=2 ops + training
       run: |
-        GPU=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
-        GPU=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
+        CL=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
+        CL=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
     - name: Run process replay tests
       uses: ./.github/actions/process-replay

   testgpumisc:
-    name: 'GPU Misc tests'
+    name: 'CL Misc tests'
     runs-on: ubuntu-22.04
     timeout-minutes: 10
     env:
@@ -368,11 +368,11 @@
         deps: testing_minimal
         opencl: 'true'
     - name: Generate Dataset
-      run: GPU=1 extra/optimization/generate_dataset.sh
+      run: CL=1 extra/optimization/generate_dataset.sh
     - name: Run Kernel Count Test
-      run: GPU=1 python -m pytest -n=auto test/external/external_test_opt.py
+      run: CL=1 python -m pytest -n=auto test/external/external_test_opt.py
     - name: Run fused optimizer tests
-      run: GPU=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
+      run: CL=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
     - name: Upload artifact
       uses: actions/upload-artifact@v4
       with:
@@ -397,17 +397,17 @@
         llvm: 'true'
     - name: Test openpilot model kernel count and gate usage
       run: |
-        ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2175 ALLOWED_GATED_READ_IMAGE=16 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
+        ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2175 ALLOWED_GATED_READ_IMAGE=16 FLOAT16=0 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
     - name: Test openpilot alt model correctness (float32)
-      run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
+      run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
     - name: Test openpilot fastvits model correctness (float32)
-      run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
+      run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
     # - name: Test openpilot simple_plan vision model correctness (float32)
-    #   run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
+    #   run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
     - name: Test openpilot LLVM compile
       run: CPU=1 CPU_LLVM=1 LLVMOPT=1 JIT=2 BEAM=0 IMAGE=0 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
     - name: Test openpilot compile4
-      run: NOLOCALS=1 GPU=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
+      run: NOLOCALS=1 CL=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
     - name: Run process replay tests
       uses: ./.github/actions/process-replay
@@ -459,16 +459,16 @@
         pydeps: "tensorflow==2.15.1 tensorflow_addons"
         python-version: '3.11'
         opencl: 'true'
-    - name: Test ONNX (GPU)
-      run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+    - name: Test ONNX (CL)
+      run: CL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
     #- name: Test Optimization Helpers
     #  run: DEBUG=1 python3 extra/optimization/test_helpers.py
     #- name: Test Action Space
-    #  run: DEBUG=1 GPU=1 python3 extra/optimization/get_action_space.py
+    #  run: DEBUG=1 CL=1 python3 extra/optimization/get_action_space.py
     - name: Test Beam Search
-      run: GPU=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
+      run: CL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
     - name: Test MLPerf stuff
-      run: GPU=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
+      run: CL=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
     - name: Test llama 3 training
       run: MAX_BUFFER_SIZE=0 DEV=NULL SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=8 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
     - name: Run process replay tests
@@ -506,8 +506,8 @@
         llvm: 'true'
     - name: Test models (llvm)
       run: CPU=1 CPU_LLVM=1 python -m pytest -n=auto test/models --durations=20
-    - name: Test models (gpu)
-      run: GPU=1 python -m pytest -n=auto test/models --durations=20
+    - name: Test models (opencl)
+      run: CL=1 python -m pytest -n=auto test/models --durations=20
     - name: Test models (cpu)
       run: CPU=1 CPU_LLVM=0 python -m pytest -n=auto test/models --durations=20
     - name: Run process replay tests
@@ -709,7 +709,7 @@
     strategy:
       fail-fast: false
       matrix:
-        backend: [llvm, cpu, gpu]
+        backend: [llvm, cpu, opencl]

     name: Linux (${{ matrix.backend }})
     runs-on: ubuntu-22.04
@@ -725,13 +725,13 @@
       with:
         key: ${{ matrix.backend }}-minimal
         deps: testing_minimal
-        opencl: ${{ matrix.backend == 'gpu' && 'true' }}
+        opencl: ${{ matrix.backend == 'opencl' && 'true' }}
         llvm: ${{ matrix.backend == 'llvm' && 'true' }}
     - name: Set env
-      run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'gpu' && 'GPU=1' }}" >> $GITHUB_ENV
+      run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'CL=1' }}" >> $GITHUB_ENV
     - name: Check Device.DEFAULT and print some source
       run: |
-        python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','GPU'], Device.DEFAULT"
+        python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','CL'], Device.DEFAULT"
         DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Run pytest (${{ matrix.backend }})
       run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
@@ -772,7 +772,7 @@
         start_server "remote-server-amd-1" "AMD" 6667
         start_server "remote-server-amd-2" "AMD" 6668
-        start_server "remote-server-gpu" "GPU" 7667
+        start_server "remote-server-gpu" "CL" 7667
         start_server "remote-server-cpu" "CPU" 8667
     - name: Check Device.DEFAULT and print some source
       env:
@@ -786,7 +786,7 @@
         HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
       run: |
        python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
-    - name: Run REMOTE=1 Test (GPU)
+    - name: Run REMOTE=1 Test (CL)
       env:
         HOST: 127.0.0.1:7667*6
       run: |
diff --git a/README.md b/README.md
index 262ea97d37..dab378a23a 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers

 tinygrad already supports numerous accelerators, including:

-- [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
+- [x] [OpenCL](tinygrad/runtime/ops_cl.py)
 - [x] [CPU](tinygrad/runtime/ops_cpu.py)
 - [x] [METAL](tinygrad/runtime/ops_metal.py)
 - [x] [CUDA](tinygrad/runtime/ops_cuda.py)
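For reference, the renamed backend is selected the same way as before, just under the new name. A minimal sketch, assuming an OpenCL-capable device; the `CL=1` and `device="CL"` spellings come from the hunks above and below:

```python
# run as: CL=1 python3 sketch.py  -- forces the OpenCL backend, per docs/env_vars.md below
from tinygrad import Tensor, Device

print(Device.DEFAULT)                     # prints "CL" when the OpenCL backend is the default
t = Tensor([1.0, 2.0, 3.0], device="CL")  # or pin a tensor to the backend explicitly
print((t + 1).numpy())                    # [2. 3. 4.]
```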
diff --git a/docs/env_vars.md b/docs/env_vars.md
index 9367eef064..44be042bfa 100644
--- a/docs/env_vars.md
+++ b/docs/env_vars.md
@@ -3,7 +3,7 @@
 This is a list of environment variable that control the runtime behavior of tinygrad and its examples.
 Most of these are self-explanatory, and are usually used to set an option at runtime.

-Example: `GPU=1 DEBUG=4 python3 -m pytest`
+Example: `CL=1 DEBUG=4 python3 -m pytest`

 However you can also decorate a function to set a value only inside that function.

@@ -31,7 +31,7 @@ These control the behavior of core tinygrad even when used as a library.
 Variable | Possible Value(s) | Description
 ---|---|---
 DEBUG | [1-7] | enable debugging output (operations, timings, speed, generated code and more)
-GPU | [1] | enable the GPU (OpenCL) backend
+CL | [1] | enable OpenCL backend
 CUDA | [1] | enable CUDA backend
 AMD | [1] | enable AMD backend
 NV | [1] | enable NV backend
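The "decorate a function" mechanism referenced in the hunk above is `tinygrad.helpers.Context`. A hedged sketch of scoping a variable to one block or function instead of setting it process-wide:

```python
# Context scopes a variable (e.g. DEBUG) the same way the environment variable would,
# but only inside the `with` block or the decorated function
from tinygrad import Tensor
from tinygrad.helpers import Context

with Context(DEBUG=4):                    # like DEBUG=4, limited to this block
  (Tensor.ones(4) + 1).realize()

@Context(DEBUG=2)                         # Context also works as a decorator
def noisy_step():
  return (Tensor.ones(4) * 2).realize()
```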
diff --git a/docs/runtime.md b/docs/runtime.md
index bc85d9bedf..656c6d75c3 100644
--- a/docs/runtime.md
+++ b/docs/runtime.md
@@ -9,7 +9,7 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra
 | [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | 6xx series GPUs |
 | [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | M1+ Macs; Metal 3.0+ for `bfloat` support |
 | [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | NVIDIA GPU with CUDA support |
-| [GPU (OpenCL)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_gpu.py) | Accelerates computations using OpenCL on GPUs | OpenCL 2.0 compatible device |
+| [OpenCL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cl.py) | Accelerates computations using OpenCL on GPUs | OpenCL 2.0 compatible device |
 | [CPU (C Code)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang compiler | `clang` compiler in system `PATH` |
 | [LLVM (LLVM IR)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_llvm.py) | Runs on CPU using the LLVM compiler infrastructure | llvm libraries installed and findable |
 | [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | Dawn library installed and findable. Download binaries [here](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0). |
diff --git a/examples/openpilot/compile4.py b/examples/openpilot/compile4.py
index c57bd3eb70..55fcccbfbf 100644
--- a/examples/openpilot/compile4.py
+++ b/examples/openpilot/compile4.py
@@ -6,7 +6,7 @@ from tinygrad.schedule.kernelize import get_kernelize_map
 from tinygrad.engine.schedule import create_schedule_with_vars
 from tinygrad.engine.realize import run_schedule

-# NOLOCALS=1 GPU=1 IMAGE=2 FLOAT16=1 VIZ=1 DEBUG=2 python3 examples/openpilot/compile4.py
+# NOLOCALS=1 CL=1 IMAGE=2 FLOAT16=1 VIZ=1 DEBUG=2 python3 examples/openpilot/compile4.py

 OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx"
 OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "/tmp/openpilot.pkl"
diff --git a/extra/archprobe.py b/extra/archprobe.py
index 73eb5037fb..7ba20b2a88 100644
--- a/extra/archprobe.py
+++ b/extra/archprobe.py
@@ -1,7 +1,7 @@
 # copying the kernels from https://github.com/microsoft/ArchProbe into Python
 import numpy as np
 import pickle
-from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer
+from tinygrad.runtime.ops_cl import CLProgram, CLBuffer
 from tinygrad import dtypes
 from tqdm import trange, tqdm
 from matplotlib import pyplot as plt
diff --git a/extra/assembly/assembly_rdna.py b/extra/assembly/assembly_rdna.py
index 0f5ab01ecf..297639d676 100644
--- a/extra/assembly/assembly_rdna.py
+++ b/extra/assembly/assembly_rdna.py
@@ -4,7 +4,7 @@ from tinygrad import dtypes
 from tinygrad.codegen.assembly import AssemblyCodegen, Register
 from tinygrad.codegen.opt.kernel import Ops
 from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
-from tinygrad.runtime.ops_gpu import ROCM_LLVM_PATH
+from tinygrad.runtime.ops_cl import ROCM_LLVM_PATH

 # ugh, is this really needed?
 from extra.helpers import enable_early_exec
diff --git a/extra/assembly/rocm/rdna3/asm.py b/extra/assembly/rocm/rdna3/asm.py
index 2f6ad13264..9c65fa7360 100644
--- a/extra/assembly/rocm/rdna3/asm.py
+++ b/extra/assembly/rocm/rdna3/asm.py
@@ -5,7 +5,7 @@ from tinygrad.helpers import colored
 from extra.helpers import enable_early_exec
 early_exec = enable_early_exec()

-from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer, ROCM_LLVM_PATH
+from tinygrad.runtime.ops_cl import CLProgram, CLBuffer, ROCM_LLVM_PATH

 ENABLE_NON_ASM = False
diff --git a/extra/export_model.py b/extra/export_model.py
index 65a8b3af9f..e29f8a8d31 100644
--- a/extra/export_model.py
+++ b/extra/export_model.py
@@ -10,7 +10,7 @@ from tinygrad.uop.ops import Ops
 import json
 from collections import OrderedDict

-EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "GPU"]
+EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "CL"]

 def compile_net(run:TinyJit, special_names:Dict[int,str]) -> Tuple[Dict[str,str],List[Tuple[str,List[str],List[int]]],Dict[str,Tuple[int,DType,int]],Dict[str,Tensor]]:
   functions, bufs, bufs_to_save, statements, bufnum = {}, {}, {}, [], 0
diff --git a/extra/gemm/intel_xmx.py b/extra/gemm/intel_xmx.py
index 8ec478e5f6..719830473a 100644
--- a/extra/gemm/intel_xmx.py
+++ b/extra/gemm/intel_xmx.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 import numpy as np
-from tinygrad.runtime.ops_gpu import CLProgram, CLCompiler
+from tinygrad.runtime.ops_cl import CLProgram, CLCompiler
 from tinygrad import Device, dtypes
 from tinygrad.device import Buffer
 from hexdump import hexdump
@@ -11,7 +11,7 @@ from hexdump import hexdump
 # https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_split_matrix_multiply_accumulate.html
 # https://hc34.hotchips.org/assets/program/conference/day1/GPU%20HPC/Intel_s%20Ponte%20Vecchio%20GPU%20-%20Architecture%20Systems%20and%20Software%20FINAL.pdf

-device = Device["GPU"]
+device = Device["CL"]

 # NOTE: only the subgroup type 8 ones work
 prog = CLProgram(device, "test", CLCompiler(device, "test").compile(f"""
@@ -26,9 +26,9 @@ __kernel void test(__global float* data0, const __global int* data1, const __glo
 """))
 #with open("/tmp/test.elf", "wb") as f: f.write(prog.lib)

-a = Buffer("GPU", 8, dtypes.float32).allocate()
-b = Buffer("GPU", 0x10, dtypes.float16).allocate()
-c = Buffer("GPU", 8*0x10, dtypes.float16).allocate()
+a = Buffer("CL", 8, dtypes.float32).allocate()
+b = Buffer("CL", 0x10, dtypes.float16).allocate()
+c = Buffer("CL", 8*0x10, dtypes.float16).allocate()

 row = np.array([1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8], np.float16)
 mat = np.random.random((8, 0x10)).astype(np.float16)
diff --git a/extra/optimization/generate_dataset.sh b/extra/optimization/generate_dataset.sh
index 6f70916979..b843dac700 100755
--- a/extra/optimization/generate_dataset.sh
+++ b/extra/optimization/generate_dataset.sh
@@ -7,7 +7,7 @@ rm $LOGOPS
 test/external/process_replay/reset.py
 CI=1 python3 -m pytest -n=auto test/test_ops.py test/test_nn.py test/test_winograd.py test/models/test_real_world.py --durations=20
-GPU=1 python3 -m pytest test/test_tiny.py
+CL=1 python3 -m pytest test/test_tiny.py

 # extract, sort and uniq
 extra/optimization/extract_dataset.py
diff --git a/extra/qcom_gpu_driver/qcom_opencl_interop.py b/extra/qcom_gpu_driver/qcom_opencl_interop.py
index d595ba343f..c2e0741ca2 100644
--- a/extra/qcom_gpu_driver/qcom_opencl_interop.py
+++ b/extra/qcom_gpu_driver/qcom_opencl_interop.py
@@ -1,6 +1,6 @@
 import ctypes, array
 from hexdump import hexdump
-from tinygrad.runtime.ops_gpu import GPUDevice
+from tinygrad.runtime.ops_cl import CLDevice
 from tinygrad.helpers import getenv, to_mv, mv_address
 from tinygrad.dtype import dtypes
 from tinygrad import Tensor, TinyJit
@@ -8,7 +8,7 @@ from tinygrad.runtime.autogen import opencl as cl
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl  # noqa: F401 # pylint: disable=unused-import

 # create raw opencl buffer.
-gdev = GPUDevice()
+gdev = CLDevice()
 cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, 0x100, None, status := ctypes.c_int32())
 assert status.value == 0
diff --git a/extra/thneed.py b/extra/thneed.py
index c59f636858..ca89bfa603 100644
--- a/extra/thneed.py
+++ b/extra/thneed.py
@@ -4,13 +4,13 @@ import struct
 import json
 import traceback
 import numpy as np
-from tinygrad.runtime.ops_gpu import CLProgram, compile_gpu
+from tinygrad.runtime.ops_cl import CLProgram, compile_gpu
 from tinygrad.device import Device
 from tinygrad.helpers import DEBUG, getenv
 from collections import defaultdict
 import pyopencl as cl
-from tinygrad.runtime.ops_gpu import OSX_TIMING_RATIO
-CL = Device["GPU"]
+from tinygrad.runtime.ops_cl import OSX_TIMING_RATIO
+CL = Device["CL"]

 DEBUGCL = getenv("DEBUGCL", 0)
 FLOAT16 = getenv("FLOAT16", 0)
@@ -110,7 +110,7 @@ class Thneed:
     prgs = {}
     for o in jdat['binaries']:
       nptr = ptr + o['length']
-      prgs[o['name']] = CLProgram(Device["GPU"], o['name'], weights[ptr:nptr])
+      prgs[o['name']] = CLProgram(Device["CL"], o['name'], weights[ptr:nptr])
       ptr = nptr

     # populate the cl_cache
@@ -267,7 +267,7 @@ class Thneed:
     for prg, args in self.cl_cache:
       events.append(prg.clprg(CL.queue, *args))
     mt = time.monotonic()
-    Device["GPU"].synchronize()
+    Device["CL"].synchronize()
     et = time.monotonic() - st
     print(f"submit in {(mt-st)*1000.0:.2f} ms, total runtime is {et*1000.0:.2f} ms")
diff --git a/test/device/test_ocl.py b/test/device/test_ocl.py
index 04b8e2523e..6f58b909db 100644
--- a/test/device/test_ocl.py
+++ b/test/device/test_ocl.py
@@ -3,9 +3,9 @@ from tinygrad import Device
 from tinygrad.device import Buffer
 from tinygrad.dtype import dtypes
 from tinygrad.helpers import CI
-from tinygrad.runtime.ops_gpu import CLDevice, CLAllocator, CLCompiler, CLProgram
+from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Runs only on OpenCL (GPU)")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
 class TestCLError(unittest.TestCase):
   @unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
   def test_oom(self):
@@ -24,7 +24,7 @@ class TestCLError(unittest.TestCase):
   def test_unaligned_copy(self):
     data = list(range(65))
     unaligned = memoryview(bytearray(data))[1:]
-    buffer = Buffer("GPU", 64, dtypes.uint8).allocate()
+    buffer = Buffer("CL", 64, dtypes.uint8).allocate()
     buffer.copyin(unaligned)
     result = memoryview(bytearray(len(data) - 1))
     buffer.copyout(result)
diff --git a/test/external/external_benchmark_hip_compile.py b/test/external/external_benchmark_hip_compile.py
index 2b1d480348..d97047b923 100644
--- a/test/external/external_benchmark_hip_compile.py
+++ b/test/external/external_benchmark_hip_compile.py
@@ -1,7 +1,7 @@
 import random, os
 from tinygrad.helpers import Timing
 from tinygrad.runtime.ops_hip import compile_hip, HIPDevice
-from tinygrad.runtime.ops_gpu import compile_cl, CLDevice
+from tinygrad.runtime.ops_cl import compile_cl, CLDevice

 # OMP_NUM_THREADS=1 strace -tt -f -e trace=file python3 test/external/external_benchmark_hip_compile.py
 # AMD_COMGR_REDIRECT_LOGS=stdout AMD_COMGR_EMIT_VERBOSE_LOGS=1 python3 test/external/external_benchmark_hip_compile.py
diff --git a/test/external/external_cl_half_max.py b/test/external/external_cl_half_max.py
index 7cd6b0c509..020d806c63 100644
--- a/test/external/external_cl_half_max.py
+++ b/test/external/external_cl_half_max.py
@@ -1,4 +1,4 @@
-from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, compile_cl
+from tinygrad.runtime.ops_cl import CLDevice, CLProgram, compile_cl

 if __name__ == "__main__":
   dev = CLDevice()
diff --git a/test/external/external_gpu_fail_osx.py b/test/external/external_gpu_fail_osx.py
index b11b695e3d..51a458b136 100644
--- a/test/external/external_gpu_fail_osx.py
+++ b/test/external/external_gpu_fail_osx.py
@@ -1,5 +1,5 @@
 # ugh, OS X OpenCL doesn't support half
-from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, CLCompiler
+from tinygrad.runtime.ops_cl import CLDevice, CLProgram, CLCompiler

 src = """#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void max_half(__global half* data0, const __global half* data1) {
diff --git a/test/external/external_multi_gpu.py b/test/external/external_multi_gpu.py
index 32d107df7d..b3c8fefb30 100644
--- a/test/external/external_multi_gpu.py
+++ b/test/external/external_multi_gpu.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # cd extra/disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
-# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
+# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so CL=1 python3 test/external/external_multi_gpu.py
 import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import colored, Timing, getenv
diff --git a/test/external/external_osx_profiling.py b/test/external/external_osx_profiling.py
index 8ac1e584df..277ca304ad 100644
--- a/test/external/external_osx_profiling.py
+++ b/test/external/external_osx_profiling.py
@@ -1,4 +1,4 @@
-from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
+from tinygrad.runtime.ops_cl import CLProgram, CL, CLBuffer
 from tinygrad import dtypes
 import time
diff --git a/test/external/external_test_hcq_fuzz_failures.py b/test/external/external_test_hcq_fuzz_failures.py
index c8b6198e9a..4a381d5336 100644
--- a/test/external/external_test_hcq_fuzz_failures.py
+++ b/test/external/external_test_hcq_fuzz_failures.py
@@ -55,7 +55,7 @@ class TestHCQFuzzFailures(unittest.TestCase):
     ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=0, mask=None, contiguous=True),)), src=()), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 2, 4)), arg=1, src=()), x39:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=0, mask=((0, 1), (0, 6)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=2, src=()), x39,)),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 2, 4)), arg=3, src=()), x46:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-6, mask=((0, 1), (6, 12)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=4, src=()), x46,)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=5, src=()), x54:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (12, 13)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=6, src=()), x54,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=7, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-13, mask=((0, 1), (13, 17)), contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=8, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-17, mask=((0, 1), (17, 21)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=9, src=()), x68:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (21, 22)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=10, src=()), x68,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=11, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-22, mask=((0, 1), (22, 26)), contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=12, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-26, mask=((0, 1), (26, 30)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=13, src=()), x82:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (30, 31)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=14, src=()), x82,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=15, src=()), x90:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (31, 32)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=16, src=()), x90,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=17, src=()), x98:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (32, 33)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=18, src=()), x98,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=19, src=()), x106:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (33, 34)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=20, src=()), x106,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=21, src=()), x114:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (34, 35)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=22, src=()), x114,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=23, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-35, mask=((0, 1), (35, 39)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=24, src=()), x125:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (39, 40)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=25, src=()), x125,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=26, src=()), x133:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (40, 41)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=27, src=()), x133,)),)),)),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 2, 4)), arg=28, src=()), x140:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-41, mask=((0, 1), (41, 47)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=29, src=()), x140,)),)),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 2, 4)), arg=30, src=()), x147:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-47, mask=((0, 1), (47, 53)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=31, src=()), x147,)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=32, src=()), x155:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (53, 54)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=33, src=()), x155,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=34, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-54, mask=((0, 1), (54, 58)), contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=35, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-58, mask=((0, 1), (58, 62)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=36, src=()), x169:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (62, 63)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=37, src=()), x169,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=38, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-63, mask=((0, 1), (63, 67)), contiguous=False),)), src=()),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=39, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-67, mask=((0, 1), (67, 71)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=40, src=()), x183:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (71, 72)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=41, src=()), x183,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=42, src=()), x191:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (72, 73)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=43, src=()), x191,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=44, src=()), x199:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (73, 74)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=45, src=()), x199,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=46, src=()), x207:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (74, 75)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=47, src=()), x207,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=48, src=()), x215:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (75, 76)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=49, src=()), x215,)),)),)),)),)), UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=50, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-76, mask=((0, 1), (76, 80)), contiguous=False),)), src=()),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=51, src=()), x226:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (80, 81)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=52, src=()), x226,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=53, src=()), x234:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (81, 82)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=54, src=()), x234,)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=55, src=()), x243:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (82, 83)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=56, src=()), x243,)),)),)), UOp(Ops.ADD, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 1, 4)), arg=57, src=()), x250:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 0), offset=0, mask=((0, 1), (83, 84)), contiguous=False),)), src=()),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=58, src=()), x250,)),)),)),)),)),)), UOp(Ops.CAST, dtypes.float, arg=None, src=( UOp(Ops.LOAD, dtypes.float, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.imageh((1, 128, 4)), arg=59, src=()), UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 596), strides=(0, 1), offset=-84, mask=((0, 1), (84, 596)), contiguous=False),)), src=()),)),)),)),)),)) # noqa: E501
     opts = [Opt(op=OptOps.UPCAST, axis=0, arg=4)]
-    helper_test_lin(Kernel(ast), opts, failed_platforms=[], validate_device=Device["GPU"])
+    helper_test_lin(Kernel(ast), opts, failed_platforms=[], validate_device=Device["CL"])

 if __name__ == '__main__':
   unittest.main()
diff --git a/test/external/external_test_image.py b/test/external/external_test_image.py
index 1c2cc397f2..4f2e06d903 100644
--- a/test/external/external_test_image.py
+++ b/test/external/external_test_image.py
@@ -4,7 +4,7 @@ import unittest
 import numpy as np

 if 'IMAGE' not in os.environ: os.environ['IMAGE'] = '2'
-os.environ['GPU'] = '1'
+os.environ['CL'] = '1'
 os.environ['OPT'] = '2'
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d
diff --git a/test/external/external_test_onnx_backend.py b/test/external/external_test_onnx_backend.py
index d48154b85d..112ccd797c 100644
--- a/test/external/external_test_onnx_backend.py
+++ b/test/external/external_test_onnx_backend.py
@@ -193,12 +193,12 @@ backend_test.exclude('test_adam_cpu')
 backend_test.exclude('test_gradient_of_add_and_mul_cpu')
 backend_test.exclude('test_gradient_of_add_cpu')

-if Device.DEFAULT in ['GPU', 'METAL']:
+if Device.DEFAULT in ['CL', 'METAL']:
   backend_test.exclude('test_resize_upsample_sizes_nearest_axes_2_3_cpu')
   backend_test.exclude('test_resize_upsample_sizes_nearest_axes_3_2_cpu')
   backend_test.exclude('test_resize_upsample_sizes_nearest_cpu')

-if Device.DEFAULT == "METAL" or (OSX and Device.DEFAULT == "GPU"):
+if Device.DEFAULT == "METAL" or (OSX and Device.DEFAULT == "CL"):
   # numerical inaccuracy
   backend_test.exclude('test_mish_cpu')
   backend_test.exclude('test_mish_expanded_cpu')
diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py
index 584c3af89f..f87206be2b 100644
--- a/test/external/external_test_opt.py
+++ b/test/external/external_test_opt.py
@@ -34,7 +34,7 @@ from extra.models.efficientnet import EfficientNet
 from extra.models.resnet import ResNet18
 from extra.models.vit import ViT

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestInferenceMinKernels(unittest.TestCase):
   def setUp(self):
     self.training_old = Tensor.training
@@ -90,7 +90,7 @@ class TestInferenceMinKernels(unittest.TestCase):
     with CLCache(100):
       model(inp, 0).realize()

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOptBinOp(unittest.TestCase):
   def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
     a = Tensor.randn(16, 16)
@@ -117,7 +117,7 @@ class TestOptBinOp(unittest.TestCase):

   #def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
   #def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOptReduceLoop(unittest.TestCase):
   def test_loop_left(self):
     a = Tensor.randn(16, 16)
@@ -139,7 +139,7 @@ class TestOptReduceLoop(unittest.TestCase):
       c.realize()
     assert cache.count == 2, "loop right fusion broken"

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOptWChild(unittest.TestCase):
   @unittest.skip("this no longer happens, use realize")
   def test_unrealized_child(self):
@@ -152,7 +152,7 @@ class TestOptWChild(unittest.TestCase):
       d.realize()
     assert cache.count == 2, "don't fuse if you have children"

-@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
+@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
 class TestOpt(unittest.TestCase):
   def test_muladd(self):
     a,b,c = [Tensor.randn(2,2).realize() for _ in range(3)]
diff --git a/test/external/fuzz_linearizer.py b/test/external/fuzz_linearizer.py
index e5242ac605..19ed23f5d6 100644
--- a/test/external/fuzz_linearizer.py
+++ b/test/external/fuzz_linearizer.py
@@ -16,7 +16,7 @@ if os.getenv("VALIDATE_HCQ", 0) != 0:
   try:
     import extra.qcom_gpu_driver.opencl_ioctl
     from tinygrad import Device
-    _, _ = Device["QCOM"], Device["GPU"]
+    _, _ = Device["QCOM"], Device["CL"]
   except Exception: pass

 from tinygrad import Tensor, Device, dtypes
@@ -42,9 +42,9 @@ if getenv("VALIDATE_HCQ"):
     on_linearizer_did_run = extra.nv_gpu_driver.nv_ioctl.collect_last_launch_state
     compare_states = extra.nv_gpu_driver.nv_ioctl.compare_launch_state
   elif Device.DEFAULT == "QCOM":
-    print("VALIDATE_HCQ: Comparing QCOM to GPU")
+    print("VALIDATE_HCQ: Comparing QCOM to CL")
     import extra.qcom_gpu_driver.opencl_ioctl
-    validate_device = Device["GPU"]
+    validate_device = Device["CL"]
     on_linearizer_will_run = extra.qcom_gpu_driver.opencl_ioctl.before_launch
     on_linearizer_did_run = extra.qcom_gpu_driver.opencl_ioctl.collect_last_launch_state
     compare_states = extra.qcom_gpu_driver.opencl_ioctl.compare_launch_state
@@ -302,7 +302,7 @@ if __name__ == "__main__":
   for i, ast in enumerate(ast_strs[:getenv("FUZZ_N", len(ast_strs))]):
     if (nth := getenv("FUZZ_NTH", -1)) != -1 and i != nth: continue
     if getenv("FUZZ_IMAGEONLY") and "dtypes.image" not in ast: continue
-    if "dtypes.image" in ast and Device.DEFAULT not in {"GPU", "QCOM"}: continue  # IMAGE is only for GPU
+    if "dtypes.image" in ast and Device.DEFAULT not in {"CL", "QCOM"}: continue  # IMAGE is only for CL
     if ast in seen_ast_strs: continue
     seen_ast_strs.add(ast)
diff --git a/test/helpers.py b/test/helpers.py
index 4833f425d9..cee64595f3 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -57,8 +57,8 @@ def eval_uop(uop:UOp, inputs:list[tuple[DType, list[Any]]]|None=None):
   return out_buf.cast(uop.dtype.fmt).tolist()[0]

 def not_support_multi_device():
-  # GPU and CUDA don't support multi device if in CI
-  return CI and REAL_DEV in ("GPU", "CUDA")
+  # CL and CUDA don't support multi device if in CI
+  return CI and REAL_DEV in ("CL", "CUDA")

 # NOTE: This will open REMOTE if it's the default device
 REAL_DEV = (Device.DEFAULT if Device.DEFAULT != "REMOTE" else Device['REMOTE'].properties.real_device)
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index c55aee0ad8..26e0ee760d 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -114,7 +114,7 @@ class TestRealWorld(unittest.TestCase):

     helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.07, 93)

-  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "GPU"}, "slow")
+  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
   def test_train_cifar(self):
     with Tensor.train():
       model = SpeedyResNet(Tensor.ones((12,3,2,2)))
diff --git a/test/models/test_train.py b/test/models/test_train.py
index 605e6f6de1..972c491923 100644
--- a/test/models/test_train.py
+++ b/test/models/test_train.py
@@ -27,7 +27,7 @@ def train_one_step(model,X,Y):
   print("done in %.2f ms" % (et*1000.))

 def check_gc():
-  if Device.DEFAULT == "GPU":
+  if Device.DEFAULT == "CL":
     from extra.introspection import print_objects
     assert print_objects() == 0
diff --git a/test/opt/test_kernel_opts.py b/test/opt/test_kernel_opts.py
index 25951a01af..c0c8865146 100644
--- a/test/opt/test_kernel_opts.py
+++ b/test/opt/test_kernel_opts.py
@@ -93,7 +93,7 @@ class TestKernelOpts(unittest.TestCase):
     a = Tensor.rand(8, N, 8, N)
     r = a.sum(axis=(1,3))
     helper_linearizer_opt(r, [
-      # openCL / GPU=1 is 256 max threads
+      # openCL / CL=1 is 256 max threads
       [Opt(OptOps.GROUPTOP, 0, 2)], [Opt(OptOps.GROUPTOP, 0, 32)],
       [Opt(OptOps.GROUPTOP, 1, 2)], [Opt(OptOps.GROUPTOP, 1, 32)],
       # Checking how it works with 1 grouped_reduce.
       [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)],
diff --git a/test/speed/external_test_copy_speed.py b/test/speed/external_test_copy_speed.py
index 351c7d993a..359a2499ac 100644
--- a/test/speed/external_test_copy_speed.py
+++ b/test/speed/external_test_copy_speed.py
@@ -77,9 +77,9 @@ class TestCopySpeed(unittest.TestCase):
       np.testing.assert_equal(t.numpy(), x.numpy())

   @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
-  @unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU")
+  @unittest.skipIf(Device.DEFAULT != "CL", "only test this on CL")
   def testCopyCPUto6GPUs(self):
-    from tinygrad.runtime.ops_gpu import CLDevice
+    from tinygrad.runtime.ops_cl import CLDevice
     if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
     t = Tensor.ones(N, N, device="CPU").contiguous().realize()
     print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
@@ -87,8 +87,8 @@ class TestCopySpeed(unittest.TestCase):
     with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"):
       with Timing("queue: "):
         for g in range(6):
-          t.to(f"gpu:{g}").realize()
-        Device["gpu"].synchronize()
+          t.to(f"CL:{g}").realize()
+        Device["CL"].synchronize()

 if __name__ == '__main__':
   unittest.main()
diff --git a/test/test_dtype.py b/test/test_dtype.py
index 3f007783a1..31e4472bf5 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -424,7 +424,7 @@ class TestDtypeUsage(unittest.TestCase):
 class TestOpsBFloat16(unittest.TestCase):
   def test_cast(self):
     # TODO: helper_test_op breaks in unrelated part
-    # TODO: wrong output with GPU=1 on mac
+    # TODO: wrong output with CL=1 on mac
     data = [60000.0, 70000.0, 80000.0]
     np.testing.assert_allclose(Tensor(data).cast("bfloat16").numpy(), torch.tensor(data).type(torch.bfloat16).float().numpy())
diff --git a/test/test_image_dtype.py b/test/test_image_dtype.py
index 08d2c04c32..6adab73e51 100644
--- a/test/test_image_dtype.py
+++ b/test/test_image_dtype.py
@@ -7,7 +7,7 @@ from tinygrad.engine.realize import lower_schedule
 from tinygrad.helpers import prod, unwrap
 from test.helpers import REAL_DEV

-IMAGE_SUPPORTED_DEVICES = ("QCOM", "GPU")
+IMAGE_SUPPORTED_DEVICES = ("QCOM", "CL")

 @unittest.skipUnless(REAL_DEV in IMAGE_SUPPORTED_DEVICES, "Images not supported")
 class TestImageCopy(unittest.TestCase):
diff --git a/test/test_ops.py b/test/test_ops.py
index eb22cf21bb..dc3952d519 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1304,7 +1304,7 @@ class TestOps(unittest.TestCase):
                                                            np.arange(64,128,dtype=np.float32).reshape(8,8)])
   def test_small_gemm_eye(self):
     helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
-  @unittest.skipIf(CI and Device.DEFAULT in ["NV", "GPU", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
+  @unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
                    or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
   def test_gemm_fp16(self):
     helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3)
diff --git a/test/test_opts.py b/test/test_opts.py
index 2c0de53199..4a6310ef32 100644
--- a/test/test_opts.py
+++ b/test/test_opts.py
@@ -13,7 +13,7 @@ class TestOpts(unittest.TestCase):
     out = (a+b).contiguous(arg=opts)
     s = out.schedule()
     self.assertEqual(s[-1].ast.arg.opts_to_apply, opts)
-    if Device.DEFAULT in {"CPU", "GPU", "METAL"} and not CPU_LLVM:
+    if Device.DEFAULT in {"CPU", "CL", "METAL"} and not CPU_LLVM:
       prg = get_program(s[-1].ast)
       self.assertIn('float4', prg.src)
diff --git a/test/test_schedule.py b/test/test_schedule.py
index a895ed1a0d..9e00835460 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -1654,7 +1654,7 @@ class TestSchedule(unittest.TestCase):
     constv = Tensor.empty(2, 2).uop.const_like(10).contiguous()
     check_schedule(constv, 1)

-  @unittest.skipIf(Device.DEFAULT != "GPU", "image only supported on GPU")
+  @unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
   def test_image_matmul(self):
     with Context(IMAGE=2):
       x = Tensor.randn((9, 9)).realize()
diff --git a/test/test_tiny.py b/test/test_tiny.py
index bf0f903ff2..78d1517522 100644
--- a/test/test_tiny.py
+++ b/test/test_tiny.py
@@ -137,7 +137,7 @@ class TestTiny(unittest.TestCase):

   # *** image ***

-  @unittest.skipIf(Device.DEFAULT != "GPU", "image only supported on GPU")
+  @unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
   def test_image(self):
     with Context(IMAGE=2):
       self.test_gemm(N=4, out_dtype=dtypes.imagef((4, 1, 4)))
diff --git a/test/test_uops.py b/test/test_uops.py
index 64b9abe2de..4a7ed30880 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -513,7 +513,7 @@ class TestUOpStr(unittest.TestCase):
     assert str(eval(str(vec))) == str(vec)

   def test_device_arg(self):
-    device = UOp(Ops.DEVICE, arg="GPU")
+    device = UOp(Ops.DEVICE, arg="CL")
     assert str(eval(str(device))) == str(device)

   def test_reduceop_arg(self):
diff --git a/test/unit/test_device.py b/test/unit/test_device.py
index 1c1b9f7997..8ab43a17f0 100644
--- a/test/unit/test_device.py
+++ b/test/unit/test_device.py
@@ -9,12 +9,12 @@ class TestDevice(unittest.TestCase):
     self.assertEqual(Device.canonicalize(None), Device.DEFAULT)
     self.assertEqual(Device.canonicalize("CPU"), "CPU")
     self.assertEqual(Device.canonicalize("cpu"), "CPU")
-    self.assertEqual(Device.canonicalize("GPU"), "GPU")
-    self.assertEqual(Device.canonicalize("GPU:0"), "GPU")
-    self.assertEqual(Device.canonicalize("gpu:0"), "GPU")
-    self.assertEqual(Device.canonicalize("GPU:1"), "GPU:1")
-    self.assertEqual(Device.canonicalize("gpu:1"), "GPU:1")
-    self.assertEqual(Device.canonicalize("GPU:2"), "GPU:2")
+    self.assertEqual(Device.canonicalize("CL"), "CL")
+    self.assertEqual(Device.canonicalize("CL:0"), "CL")
+    self.assertEqual(Device.canonicalize("cl:0"), "CL")
+    self.assertEqual(Device.canonicalize("CL:1"), "CL:1")
+    self.assertEqual(Device.canonicalize("cl:1"), "CL:1")
+    self.assertEqual(Device.canonicalize("CL:2"), "CL:2")
     self.assertEqual(Device.canonicalize("disk:/dev/shm/test"), "DISK:/dev/shm/test")
     self.assertEqual(Device.canonicalize("disk:000.txt"), "DISK:000.txt")
diff --git a/test/unit/test_indexing.py b/test/unit/test_indexing.py
index da5d61944c..c9d6d7c7da 100644
--- a/test/unit/test_indexing.py
+++ b/test/unit/test_indexing.py
@@ -181,7 +181,7 @@ class TestIndexing(unittest.TestCase):
     # self.assertRaises(TypeError, delitem)

   # TODO: LLVM is quite fast, why are other compiled backends slow?
-  @unittest.skipIf(CI and Device.DEFAULT in ["CPU", "GPU", "METAL", "NV", "AMD"], "slow")
+  @unittest.skipIf(CI and Device.DEFAULT in ["CPU", "CL", "METAL", "NV", "AMD"], "slow")
   def test_advancedindex(self):
     # integer array indexing
diff --git a/test/unit/test_simplify_valid_idx.py b/test/unit/test_simplify_valid_idx.py
index d75c872209..359f7d108f 100644
--- a/test/unit/test_simplify_valid_idx.py
+++ b/test/unit/test_simplify_valid_idx.py
@@ -359,7 +359,7 @@ class TestImageSimplification(unittest.TestCase):
     self.check(load, None, "((gidx*3)+-1438)", "0")

   def test_simplify2(self):
-    # from GPU=1 DEBUG=4 FORWARD_ONLY=1 IMAGE=2 python3 test/test_ops.py TestOps.test_simple_padding_conv2d
+    # from CL=1 DEBUG=4 FORWARD_ONLY=1 IMAGE=2 python3 test/test_ops.py TestOps.test_simple_padding_conv2d
     lidx = Special("lidx", 4)
     valid = (lidx<3) & (lidx<1).ne(True)
     idx = ((lidx+1)%2, (lidx+1)//2-1)
diff --git a/tinygrad/device.py b/tinygrad/device.py
index 483dc9fe35..a69c5316d7 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -11,7 +11,7 @@ from tinygrad.renderer import Renderer

 # **************** Device ****************

-ALL_DEVICES = ["METAL", "AMD", "NV", "CUDA", "QCOM", "GPU", "CPU", "DSP", "WEBGPU"]
+ALL_DEVICES = ["METAL", "AMD", "NV", "CUDA", "QCOM", "CL", "CPU", "DSP", "WEBGPU"]
 class _Device:
   def __init__(self) -> None:
     self._devices = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
@@ -336,11 +336,11 @@ def is_dtype_supported(dtype:DType, device:str|None=None) -> bool:
   # CI CUDA architecture is sm_35 but we need at least sm_70 to run fp16 ALUs
   # PYTHON supports half memoryview in 3.12+ https://github.com/python/cpython/issues/90751
   if dtype == dtypes.half:
-    if device == "GPU": return not CI and not OSX
+    if device == "CL": return not CI and not OSX
     if device in ["CUDA", "NV"]: return not CI
     if device == "CPU" and CPU_LLVM: return OSX
     if device == "PYTHON": return sys.version_info >= (3, 12)
-  if dtype == dtypes.float64: return device != "METAL" and not (OSX and device == "GPU")
+  if dtype == dtypes.float64: return device != "METAL" and not (OSX and device == "CL")
   return True

 if PROFILE:
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index b268ce7216..20e3b8ec02 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -242,7 +242,7 @@ class ClangRenderer(CStyleLanguage):
     return defines + "\n" + self._render_body(function_name, kernel, bufs, uops, prefix) + "\n" + self._render_entry(function_name, bufs)

 class OpenCLRenderer(CStyleLanguage):
-  device = "GPU"
+  device = "CL"

   # language options
   kernel_typedef = "__kernel void"
@@ -271,7 +271,7 @@ class OpenCLRenderer(CStyleLanguage):
     return super().render_kernel(function_name, kernel, bufs, uops, prefix)

 class IntelRenderer(OpenCLRenderer):
-  device, suffix, kernel_typedef = "GPU", "INTEL", "__attribute__((intel_reqd_sub_group_size(8)))\n" + "__kernel void"
+  device, suffix, kernel_typedef = "CL", "INTEL", "__attribute__((intel_reqd_sub_group_size(8)))\n" + "__kernel void"
   tensor_cores = tc.intel

   string_rewrite = PatternMatcher([
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_cl.py
similarity index 100%
rename from tinygrad/runtime/ops_gpu.py
rename to tinygrad/runtime/ops_cl.py
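Because the rename is 100% similar, only the module path changes for downstream imports; a hedged before/after sketch (class names unchanged, as seen throughout the hunks above):

```python
# before: from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, CLCompiler
# after:
from tinygrad.runtime.ops_cl import CLDevice, CLProgram, CLCompiler

dev = CLDevice()  # first OpenCL device, same constructor as before
```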
diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py
index f3e7dc899f..787d349ee9 100644
--- a/tinygrad/runtime/ops_qcom.py
+++ b/tinygrad/runtime/ops_qcom.py
@@ -7,7 +7,7 @@ from tinygrad.device import BufferSpec
 from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
 from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
 from tinygrad.runtime.autogen import kgsl, adreno
-from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
+from tinygrad.runtime.ops_cl import CLCompiler, CLDevice
 from tinygrad.renderer.cstyle import QCOMRenderer
 from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl  # noqa: F401 # pylint: disable=unused-import
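Taken together, the user-visible change reduces to three renames; a summary sketch (equivalences taken from the hunks above, the snippet itself is illustrative):

```python
# GPU=1                     ->  CL=1                      (env var selecting the OpenCL backend)
# "GPU" / "gpu:1"           ->  "CL" / "cl:1"             (device strings; see test/unit/test_device.py)
# tinygrad.runtime.ops_gpu  ->  tinygrad.runtime.ops_cl   (module path; GPUDevice -> CLDevice)
from tinygrad import Tensor

x = Tensor.ones(2, 2, device="CL")  # was device="GPU"
print(x.device)                     # "CL"
```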