diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 6c2d54be9e..d1aadc9498 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -64,7 +64,7 @@ jobs: run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded - name: Test AMX tensor cores run: | - DEBUG=2 CLANG=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded + DEBUG=2 CPU=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded - name: Run Tensor Core GEMM (float) run: DEBUG=2 python3.11 extra/gemm/simple_matmul.py | tee matmul.txt diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4df40f654f..44a1de2d3b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -81,7 +81,7 @@ jobs: run: DEBUG=100 python3 -c "from tinygrad import Tensor; N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N); c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2); print((c.numpy() - (a.numpy() @ b.numpy())).mean())" - name: Compile EfficientNet to C and test it run: | - CLANG=1 PYTHONPATH="." python examples/compile_efficientnet.py > recognize.c + CPU=1 PYTHONPATH="." python examples/compile_efficientnet.py > recognize.c clang -O2 recognize.c -lm -o recognize cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock @@ -355,13 +355,13 @@ jobs: llvm: 'true' - name: Test ONNX (GPU) run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - - name: Test ONNX (CLANG) - run: CLANG=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 + - name: Test ONNX (CPU) + run: CPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - name: Test ONNX (LLVM) run: LLVM=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20 - name: Run CLOUD=1 Test run: | - CLOUDDEV=CLANG CLOUD=1 python3 test/test_tiny.py + CLOUDDEV=CPU CLOUD=1 python3 test/test_tiny.py CLOUDDEV=GPU CLOUD=1 python3 test/test_tiny.py CLOUDDEV=GPU IMAGE=2 CLOUD=1 python3 test/test_tiny.py - name: Test Optimization Helpers @@ -378,7 +378,7 @@ jobs: uses: ./.github/actions/process-replay testmodels: - name: Models (llvm+clang+gpu) + name: Models (llvm+cpu+gpu) runs-on: ubuntu-22.04 timeout-minutes: 10 steps: @@ -395,8 +395,8 @@ jobs: run: LLVM=1 python -m pytest -n=auto test/models --durations=20 - name: Test models (gpu) run: GPU=1 python -m pytest -n=auto test/models --durations=20 - - name: Test models (clang) - run: CLANG=1 python -m pytest -n=auto test/models --durations=20 + - name: Test models (cpu) + run: CPU=1 python -m pytest -n=auto test/models --durations=20 - name: Run process replay tests uses: ./.github/actions/process-replay @@ -431,8 +431,8 @@ jobs: run: PYTHONPATH="." 
DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py - name: Test LLVM=1 DEVECTORIZE=0 run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" - - name: Test CLANG=1 DEVECTORIZE=0 - run: CLANG=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" + - name: Test CPU=1 DEVECTORIZE=0 + run: CPU=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure" testwebgpu: name: Linux (WebGPU) @@ -464,7 +464,7 @@ jobs: strategy: fail-fast: false matrix: - backend: [llvm, clang, gpu, ptx, amd, nv] #, triton] + backend: [llvm, cpu, gpu, ptx, amd, nv] #, triton] name: Linux (${{ matrix.backend }}) runs-on: ubuntu-22.04 @@ -482,10 +482,10 @@ jobs: amd: ${{ matrix.backend == 'amd' && 'true' }} cuda: ${{ (matrix.backend == 'ptx' || matrix.backend == 'triton' || matrix.backend == 'nv') && 'true' }} - name: Set env - run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nPTX=1\nMOCKGPU=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nNV=1\nMOCKGPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' || matrix.backend == 'nv' && 'NV=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV + run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nPTX=1\nMOCKGPU=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nNV=1\nMOCKGPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' || matrix.backend == 'nv' && 'NV=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV - name: Check Device.DEFAULT and print some source run: | - PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU','AMD','NV'], Device.DEFAULT" + PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CPU','CUDA','GPU','AMD','NV'], Device.DEFAULT" DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add - name: Run pytest (not cuda or amd) if: matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd' && matrix.backend != 'nv' @@ -582,7 +582,7 @@ jobs: strategy: fail-fast: false matrix: - backend: [metal, llvm, clang] + backend: [metal, llvm, cpu] name: MacOS (${{ matrix.backend }}) runs-on: macos-15 timeout-minutes: 10 @@ -596,7 +596,7 @@ jobs: deps: testing_minimal llvm: ${{ matrix.backend == 'llvm' && 'true' }} - name: Set env - run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'metal' && 'METAL=1\nJIT=2'}}" >> $GITHUB_ENV + run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'metal' && 'METAL=1\nJIT=2'}}" >> $GITHUB_ENV - name: Check Device.DEFAULT and print some source run: | python -c "from tinygrad import Device; assert Device.DEFAULT == '${{ matrix.backend }}'.upper(), Device.DEFAULT" @@ -612,7 +612,7 @@ jobs: strategy: fail-fast: false matrix: - backend: [llvm, clang] + backend: [llvm, cpu] name: Windows (${{ matrix.backend }}) runs-on: 
windows-latest @@ -627,7 +627,7 @@ jobs: deps: testing_unit - name: Set env shell: bash - run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1'}}" >> $GITHUB_ENV + run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1'}}" >> $GITHUB_ENV - name: Run unit tests if: matrix.backend=='llvm' run: python -m pytest -n=auto test/unit/ --ignore=test/unit/test_disk_tensor.py --ignore=test/unit/test_elf.py --ignore=test/unit/test_tar.py diff --git a/README.md b/README.md index 226c3ddfab..7b0dc08564 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers tinygrad already supports numerous accelerators, including: - [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py) -- [x] [CLANG (C Code)](tinygrad/runtime/ops_clang.py) +- [x] [CPU (C Code)](tinygrad/runtime/ops_cpu.py) - [x] [LLVM](tinygrad/runtime/ops_llvm.py) - [x] [METAL](tinygrad/runtime/ops_metal.py) - [x] [CUDA](tinygrad/runtime/ops_cuda.py) diff --git a/docs/abstractions2.py b/docs/abstractions2.py index 4d0ceacb7a..1998cb3383 100644 --- a/docs/abstractions2.py +++ b/docs/abstractions2.py @@ -7,7 +7,7 @@ print("******** first, the runtime ***********") -from tinygrad.runtime.ops_clang import ClangJITCompiler, MallocAllocator, CPUProgram +from tinygrad.runtime.ops_cpu import ClangJITCompiler, MallocAllocator, CPUProgram # allocate some buffers out = MallocAllocator.alloc(4) @@ -34,7 +34,7 @@ assert val == 5 print("******** second, the Device ***********") -DEVICE = "CLANG" # NOTE: you can change this! +DEVICE = "CPU" # NOTE: you can change this! import struct from tinygrad.dtype import dtypes @@ -90,7 +90,7 @@ out = a.alu(Ops.ADD, b) # schedule the computation as a list of kernels sched, _, becomes_map = create_schedule_with_vars(out.sink()) -for si in sched: print(si.ast.op) # NOTE: the first two convert it to CLANG +for si in sched: print(si.ast.op) # NOTE: the first two convert it to CPU # NOTE: UOps are no longer mutable, the scheduler gives you a map to lookup which BUFFER the result was written to out = becomes_map[out] diff --git a/docs/developer/runtime.md b/docs/developer/runtime.md index 0c8a9d5ed9..b246dfeb0e 100644 --- a/docs/developer/runtime.md +++ b/docs/developer/runtime.md @@ -38,7 +38,7 @@ The `Allocator` class is responsible for managing memory on the device. There is The `Program` class is created for each loaded program. It is responsible for executing the program on the device. As an example, here is a `CPUProgram` implementation which loads program and runs it. -::: tinygrad.runtime.ops_clang.CPUProgram +::: tinygrad.runtime.ops_cpu.CPUProgram options: members: true diff --git a/docs/env_vars.md b/docs/env_vars.md index 4e4d3696de..2ac4066141 100644 --- a/docs/env_vars.md +++ b/docs/env_vars.md @@ -31,13 +31,13 @@ These control the behavior of core tinygrad even when used as a library. 
Variable | Possible Value(s) | Description ---|---|--- DEBUG | [1-6] | enable debugging output, with 4 you get operations, timings, speed, generated code and more -GPU | [1] | enable the GPU backend +GPU | [1] | enable the GPU (OpenCL) backend CUDA | [1] | enable CUDA backend AMD | [1] | enable AMD backend NV | [1] | enable NV backend METAL | [1] | enable Metal backend (for Mac M1 and after) METAL_XCODE | [1] | enable Metal using macOS Xcode SDK -CLANG | [1] | enable Clang backend +CPU | [1] | enable CPU (Clang) backend LLVM | [1] | enable LLVM backend BEAM | [#] | number of beams in kernel beam search DEFAULT_FLOAT | [HALF, ...]| specify the default float dtype (FLOAT32, HALF, BFLOAT16, FLOAT64, ...), default to FLOAT32 diff --git a/docs/mnist.md b/docs/mnist.md index 8aae08f241..ce55890eb9 100644 --- a/docs/mnist.md +++ b/docs/mnist.md @@ -17,7 +17,7 @@ from tinygrad import Device print(Device.DEFAULT) ``` -You will see `CUDA` here on a GPU instance, or `CLANG` here on a CPU instance. +You will see `CUDA` here on a GPU instance, or `CPU` here on a CPU instance. ## A simple model diff --git a/docs/runtime.md b/docs/runtime.md index 93ee07c46f..af78854b03 100644 --- a/docs/runtime.md +++ b/docs/runtime.md @@ -1,6 +1,6 @@ # Runtimes -tinygrad supports various runtimes, enabling your code to scale across a wide range of devices. The default runtime can be automatically selected based on the available hardware, or you can force a specific runtime to be default using environment variables (e.g., `CLANG=1`). +tinygrad supports various runtimes, enabling your code to scale across a wide range of devices. The default runtime can be automatically selected based on the available hardware, or you can force a specific runtime to be default using environment variables (e.g., `CPU=1`). | Runtime | Description | Requirements | |---------|-------------|--------------| @@ -10,6 +10,6 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra | [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | M1+ Macs; Metal 3.0+ for `bfloat` support | | [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | NVIDIA GPU with CUDA support | | [GPU (OpenCL)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_gpu.py) | Accelerates computations using OpenCL on GPUs | OpenCL 2.0 compatible device | -| [CLANG (C Code)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_clang.py) | Runs on CPU using the clang compiler | `clang` compiler in system `PATH` | +| [CPU (C Code)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang compiler | `clang` compiler in system `PATH` | | [LLVM (LLVM IR)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_llvm.py) | Runs on CPU using the LLVM compiler infrastructure | llvm libraries installed and findable | | [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | Dawn library installed and findable. Download binaries [here](https://github.com/wpmed92/pydawn/releases/tag/v0.1.6). 
| diff --git a/examples/compile_efficientnet.py b/examples/compile_efficientnet.py index 1a27c159c4..3690e7ed33 100644 --- a/examples/compile_efficientnet.py +++ b/examples/compile_efficientnet.py @@ -15,9 +15,9 @@ if __name__ == "__main__": if getenv("WEBGPU"): safe_save(get_state_dict(model), (dirname / "net.safetensors").as_posix()) load_state_dict(model, safe_load(str(dirname / "net.safetensors"))) - mode = "clang" if getenv("CLANG", "") != "" else "webgpu" if getenv("WEBGPU", "") != "" else "" + mode = "clang" if getenv("CPU", "") != "" else "webgpu" if getenv("WEBGPU", "") != "" else "" prg, inp_sizes, out_sizes, state = export_model(model, mode, Tensor.randn(1,3,224,224)) - if getenv("CLANG", "") == "": + if getenv("CPU", "") == "": ext = "js" if getenv("WEBGPU", "") != "" else "json" with open(dirname / f"net.{ext}", "w") as text_file: text_file.write(prg) @@ -68,6 +68,6 @@ if __name__ == "__main__": else printf("%s\\n", lbls[best_idx]); }""") - # CLANG=1 python3 examples/compile_efficientnet.py | clang -O2 -lm -x c - -o recognize && DEBUG=1 time ./recognize docs/showcase/stable_diffusion_by_tinygrad.jpg + # CPU=1 python3 examples/compile_efficientnet.py | clang -O2 -lm -x c - -o recognize && DEBUG=1 time ./recognize docs/showcase/stable_diffusion_by_tinygrad.jpg # category : 281 (tabby, tabby cat) with 9.452788 print('\n'.join(cprog)) diff --git a/examples/compile_tensorflow.py b/examples/compile_tensorflow.py index 7733934880..c906592282 100644 --- a/examples/compile_tensorflow.py +++ b/examples/compile_tensorflow.py @@ -1,7 +1,7 @@ # An example to compile a small Tensorflow model to extremely portable C code import os, sys -os.environ["CLANG"] = '1' +os.environ["CPU"] = '1' os.environ["JIT"] = '2' import numpy as np diff --git a/examples/llm.c/export.py b/examples/llm.c/export.py index d257b81778..b041aba10e 100755 --- a/examples/llm.c/export.py +++ b/examples/llm.c/export.py @@ -2,7 +2,7 @@ import os if "NOOPT" not in os.environ: os.environ["NOOPT"] = "1" from tinygrad import Device, nn, Tensor, dtypes, Variable -Device.DEFAULT = "CLANG" +Device.DEFAULT = "CPU" from train_gpt2 import GPT, GPTConfig from tinygrad.helpers import dedup, to_function_name, flatten, getenv, GlobalCounters, ansilen, to_function_name from tinygrad.engine.realize import get_kernel, run_schedule @@ -43,9 +43,9 @@ if __name__ == "__main__": ast_dedup = dedup([si.ast for si in sched if si.ast.op is Ops.SINK]) srcs = {} for ast in ast_dedup: - k = get_kernel(Device["CLANG"].renderer, ast) + k = get_kernel(Device["CPU"].renderer, ast) k.linearize() - src = Device["CLANG"].renderer.render(to_function_name(k.name), k.uops) + src = Device["CPU"].renderer.render(to_function_name(k.name), k.uops) srcs[ast] = (k.name, src) print("functions:", len(srcs)) used_buffers = dedup(flatten([si.bufs for si in sched])) diff --git a/examples/mlperf/dataloader.py b/examples/mlperf/dataloader.py index 2bde3fe5f1..833915f190 100644 --- a/examples/mlperf/dataloader.py +++ b/examples/mlperf/dataloader.py @@ -170,13 +170,13 @@ def batch_load_resnet(batch_size=64, val=False, shuffle=True, seed=None, pad_fir def process_batch_bert(data: List[dict]) -> dict[str, Tensor]: return { - "input_ids": Tensor(np.concatenate([s["input_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - "input_mask": Tensor(np.concatenate([s["input_mask"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - "segment_ids": Tensor(np.concatenate([s["segment_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - 
"masked_lm_positions": Tensor(np.concatenate([s["masked_lm_positions"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - "masked_lm_ids": Tensor(np.concatenate([s["masked_lm_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), - "masked_lm_weights": Tensor(np.concatenate([s["masked_lm_weights"] for s in data], axis=0), dtype=dtypes.float32, device="CLANG"), - "next_sentence_labels": Tensor(np.concatenate([s["next_sentence_labels"] for s in data], axis=0), dtype=dtypes.int32, device="CLANG"), + "input_ids": Tensor(np.concatenate([s["input_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "input_mask": Tensor(np.concatenate([s["input_mask"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "segment_ids": Tensor(np.concatenate([s["segment_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "masked_lm_positions": Tensor(np.concatenate([s["masked_lm_positions"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "masked_lm_ids": Tensor(np.concatenate([s["masked_lm_ids"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), + "masked_lm_weights": Tensor(np.concatenate([s["masked_lm_weights"] for s in data], axis=0), dtype=dtypes.float32, device="CPU"), + "next_sentence_labels": Tensor(np.concatenate([s["next_sentence_labels"] for s in data], axis=0), dtype=dtypes.int32, device="CPU"), } def load_file(file: str): diff --git a/examples/mlperf/helpers.py b/examples/mlperf/helpers.py index 0c01db2a9a..07469f2757 100644 --- a/examples/mlperf/helpers.py +++ b/examples/mlperf/helpers.py @@ -222,11 +222,11 @@ def get_mlperf_bert_model(): def get_fake_data_bert(BS:int): return { - "input_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CLANG"), - "input_mask": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CLANG"), - "segment_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CLANG"), - "masked_lm_positions": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CLANG"), - "masked_lm_ids": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CLANG"), - "masked_lm_weights": Tensor.empty((BS, 76), dtype=dtypes.float32, device="CLANG"), - "next_sentence_labels": Tensor.empty((BS, 1), dtype=dtypes.int32, device="CLANG"), + "input_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"), + "input_mask": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"), + "segment_ids": Tensor.empty((BS, 512), dtype=dtypes.int32, device="CPU"), + "masked_lm_positions": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CPU"), + "masked_lm_ids": Tensor.empty((BS, 76), dtype=dtypes.int32, device="CPU"), + "masked_lm_weights": Tensor.empty((BS, 76), dtype=dtypes.float32, device="CPU"), + "next_sentence_labels": Tensor.empty((BS, 1), dtype=dtypes.int32, device="CPU"), } diff --git a/extra/backends/clang_graph.py b/extra/backends/clang_graph.py index 48fdc439d2..5e558a3ef7 100644 --- a/extra/backends/clang_graph.py +++ b/extra/backends/clang_graph.py @@ -5,7 +5,7 @@ from tinygrad.engine.jit import GraphRunner, GraphException from tinygrad.device import Buffer, Device from tinygrad.engine.realize import ExecItem, CompiledRunner from tinygrad.ops import Variable -from tinygrad.runtime.ops_clang import ClangProgram +from tinygrad.runtime.ops_cpu import ClangProgram from tinygrad.renderer.cstyle import ClangRenderer render_dtype = ClangRenderer().render_dtype @@ -30,7 +30,7 @@ class ClangGraph(GraphRunner): code.append(f" {cast(CompiledRunner, ji.prg).p.function_name}({','.join(args)});") code.append("}") if DEBUG 
>= 4: print("\n".join(code)) - compiler = Device["CLANG"].compiler + compiler = Device["CPU"].compiler assert compiler is not None self._prg = ClangProgram("batched", compiler.compile(prgs+"\n"+"\n".join(code))) # no point in caching the pointers diff --git a/extra/export_model.py b/extra/export_model.py index 51b48c6f28..edd5b01f49 100644 --- a/extra/export_model.py +++ b/extra/export_model.py @@ -8,7 +8,7 @@ from tinygrad.helpers import Context from tinygrad.dtype import dtypes import json -EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CLANG", "CUDA", "GPU"] +EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "GPU"] def compile_net(run:TinyJit, special_names:Dict[int,str]) -> Tuple[Dict[str,str],List[Tuple[str,List[str],List[int]]],Dict[str,Tuple[int,DType,int]],Dict[str,Tensor]]: functions, bufs, bufs_to_save, statements, bufnum = {}, {}, {}, [], 0 @@ -191,7 +191,7 @@ export default {exported_name}; """ def export_model(model, target:str, *inputs, model_name: Optional[str] = None): - assert Device.DEFAULT in EXPORT_SUPPORTED_DEVICE, "only WEBGPU, CLANG, CUDA, GPU, METAL are supported" + assert Device.DEFAULT in EXPORT_SUPPORTED_DEVICE, "only WEBGPU, CPU, CUDA, GPU, METAL are supported" with Context(JIT=2): run,special_names = jit_model(model, *inputs) functions, statements, bufs, bufs_to_save = compile_net(run, special_names) state = get_state_dict(model) diff --git a/extra/gemm/tvm_gemm.py b/extra/gemm/tvm_gemm.py index d09dd36c35..03fe0f7894 100644 --- a/extra/gemm/tvm_gemm.py +++ b/extra/gemm/tvm_gemm.py @@ -32,8 +32,8 @@ import os from tinygrad.tensor import Tensor # define the compute -A = Tensor.rand(M, K, device="clang") -B = Tensor.rand(K, N, device="clang") +A = Tensor.rand(M, K, device="CPU") +B = Tensor.rand(K, N, device="CPU") C = (A.reshape(M, 1, K) * B.permute(1,0).reshape(1, N, K)).sum(axis=2) sched = C.schedule() @@ -42,6 +42,6 @@ from tinygrad.device import CompilerOptions lin = Kernel(sched[-1].ast, CompilerOptions(has_local=False, supports_float4=False)) #lin.hand_coded_optimizations() lin.linearize() -from tinygrad.runtime.ops_clang import renderer +from tinygrad.runtime.ops_cpu import renderer src = renderer("mmult", lin.uops) print(src) diff --git a/test/external/external_model_benchmark.py b/test/external/external_model_benchmark.py index 4c0b720df2..6653839ce0 100644 --- a/test/external/external_model_benchmark.py +++ b/test/external/external_model_benchmark.py @@ -138,7 +138,7 @@ def assert_allclose(tiny_out:dict, onnx_out:dict, rtol=1e-5, atol=1e-5): else: np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}") if __name__ == "__main__": - devices = [Device.DEFAULT] if getenv("NOCLANG") else [Device.DEFAULT, "CLANG"] + devices = [Device.DEFAULT] if getenv("NOCLANG") else [Device.DEFAULT, "CPU"] if getenv("MODEL", "") != "": benchmark_model(getenv("MODEL", ""), devices, True) else: for m in MODELS: benchmark_model(m, devices, True) diff --git a/test/external/external_multi_gpu.py b/test/external/external_multi_gpu.py index 00c02b41cb..e5dd836c88 100644 --- a/test/external/external_multi_gpu.py +++ b/test/external/external_multi_gpu.py @@ -19,8 +19,8 @@ if __name__ == "__main__": with Timing("GPU initial sync: "): sync() with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"): - c0 = (Tensor.ones(sz, device="clang")/2).realize() - c1 = (Tensor.ones(sz, device="clang")/4).realize() + c0 = (Tensor.ones(sz, device="CPU")/2).realize() + c1 = (Tensor.ones(sz, device="CPU")/4).realize() 
print(c0.lazydata.base.realized) print(c1.lazydata.base.realized) diff --git a/test/external/external_test_example.py b/test/external/external_test_example.py index d6149d1467..1dc0b7b547 100644 --- a/test/external/external_test_example.py +++ b/test/external/external_test_example.py @@ -23,10 +23,10 @@ def multidevice_test(fxn): class TestExample(unittest.TestCase): @multidevice_test - def test_convert_to_clang(self, device): + def test_convert_to_cpu(self, device): a = Tensor([[1,2],[3,4]], device=device) assert a.numpy().shape == (2,2) - b = a.to("CLANG") + b = a.to("CPU") assert b.numpy().shape == (2,2) @multidevice_test diff --git a/test/imported/test_indexing.py b/test/imported/test_indexing.py index 21c5b251cb..271aeb586b 100644 --- a/test/imported/test_indexing.py +++ b/test/imported/test_indexing.py @@ -181,7 +181,7 @@ class TestIndexing(unittest.TestCase): # self.assertRaises(TypeError, delitem) # TODO: LLVM is quite fast, why are other compiled backends slow? - @unittest.skipIf(CI and Device.DEFAULT in ["CLANG", "GPU", "METAL", "NV", "AMD"], "slow") + @unittest.skipIf(CI and Device.DEFAULT in ["CPU", "GPU", "METAL", "NV", "AMD"], "slow") def test_advancedindex(self): # integer array indexing diff --git a/test/models/test_mnist.py b/test/models/test_mnist.py index e51555f507..2f5e939862 100644 --- a/test/models/test_mnist.py +++ b/test/models/test_mnist.py @@ -49,7 +49,7 @@ class TinyConvNet: x = x.reshape(shape=[x.shape[0], -1]) return x.dot(self.l1) -@unittest.skipIf(CI and Device.DEFAULT == "CLANG", "slow") +@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow") class TestMNIST(unittest.TestCase): def test_sgd_onestep(self): np.random.seed(1337) diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py index 5fa29defc9..17a725b3bf 100644 --- a/test/models/test_real_world.py +++ b/test/models/test_real_world.py @@ -48,7 +48,7 @@ class TestRealWorld(unittest.TestCase): def tearDown(self): dtypes.default_float = self.old_float - @unittest.skipIf(CI and Device.DEFAULT == "CLANG", "slow, covered by METAL") + @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow, covered by METAL") @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need dtypes.float16") def test_stable_diffusion(self): params = unet_params @@ -95,7 +95,7 @@ class TestRealWorld(unittest.TestCase): with Context(JIT=0): return model(t, v).realize() helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23 if CI else 0.9, 137 if CI else 396, all_jitted=True) - @unittest.skipIf(CI and Device.DEFAULT == "CLANG", "slow") + @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow") def test_train_mnist(self): from examples.beautiful_mnist import Model with Tensor.train(): @@ -113,7 +113,7 @@ class TestRealWorld(unittest.TestCase): helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.07, 92) - @unittest.skipIf(CI and Device.DEFAULT in {"CLANG", "GPU", "LLVM"}, "slow") + @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "GPU", "LLVM"}, "slow") def test_train_cifar(self): with Tensor.train(): model = SpeedyResNet(Tensor.ones((12,3,2,2))) diff --git a/test/models/test_whisper.py b/test/models/test_whisper.py index 458e8b69c5..f1696fd490 100644 --- a/test/models/test_whisper.py +++ b/test/models/test_whisper.py @@ -16,7 +16,7 @@ TRANSCRIPTION_2 = "a slightly longer audio file so that we can test batch transc TEST_FILE_3_URL = 'https://homepage.ntu.edu.tw/~karchung/miniconversations/mc45.mp3' TRANSCRIPTION_3 = "Just lie back and relax. 
Is the level of pressure about right? Yes, it's fine, and I'd like conditioner please. Sure. I'm going to start the second lathering now. Would you like some Q-tips? How'd you like it cut? I'd like my bangs and the back trimmed, and I'd like the rest thinned out a bit and layered. Where would you like the part? On the left, right about here. Here, have a look. What do you think? It's fine. Here's a thousand anti-dollars. It's 30-ant extra for the rants. Here's your change and receipt. Thank you, and please come again. So how do you like it? It could have been worse, but you'll notice that I didn't ask her for her card. Hmm, yeah. Maybe you can try that place over there next time." # noqa: E501 -@unittest.skipIf(CI and Device.DEFAULT in ["CLANG"], "slow") +@unittest.skipIf(CI and Device.DEFAULT in ["CPU"], "slow") @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16 support") class TestWhisper(unittest.TestCase): @classmethod diff --git a/test/test_copy_speed.py b/test/test_copy_speed.py index 0c3a85fc98..5d0cc69c26 100644 --- a/test/test_copy_speed.py +++ b/test/test_copy_speed.py @@ -24,7 +24,7 @@ class TestCopySpeed(unittest.TestCase): s.unlink() def testCopyCPUtoDefault(self): - t = Tensor.ones(N, N, device="clang").contiguous().realize() + t = Tensor.ones(N, N, device="CPU").contiguous().realize() print(f"buffer: {t.nbytes()*1e-9:.2f} GB") for _ in range(3): with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): @@ -35,7 +35,7 @@ class TestCopySpeed(unittest.TestCase): def testCopyCPUtoDefaultFresh(self): print("fresh copy") for _ in range(3): - t = Tensor.ones(N, N, device="clang").contiguous().realize() + t = Tensor.ones(N, N, device="CPU").contiguous().realize() with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): # noqa: F821 with Timing("queue: "): t.to(Device.DEFAULT).realize() @@ -47,14 +47,14 @@ class TestCopySpeed(unittest.TestCase): print(f"buffer: {t.nbytes()*1e-9:.2f} GB") for _ in range(3): with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): - t.to('clang').realize() + t.to('CPU').realize() @unittest.skipIf(CI, "CI doesn't have 6 GPUs") @unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU") def testCopyCPUto6GPUs(self): from tinygrad.runtime.ops_gpu import CLDevice if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs") - t = Tensor.ones(N, N, device="clang").contiguous().realize() + t = Tensor.ones(N, N, device="CPU").contiguous().realize() print(f"buffer: {t.nbytes()*1e-9:.2f} GB") for _ in range(3): with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"): diff --git a/test/test_fuzz_shape_ops.py b/test/test_fuzz_shape_ops.py index 2bf7889a40..b90d98cb03 100644 --- a/test/test_fuzz_shape_ops.py +++ b/test/test_fuzz_shape_ops.py @@ -38,7 +38,7 @@ def apply(tor, ten, tor_fn, ten_fn=None): except: ten, ok = None, not ok # noqa: E722 return tor, ten, ok -@unittest.skipIf(CI and Device.DEFAULT in ("CLANG", "NV"), "slow") +@unittest.skipIf(CI and Device.DEFAULT in ("CPU", "NV"), "slow") class TestShapeOps(unittest.TestCase): @settings.get_profile(__file__) @given(st_shape(), st_int32, st.one_of(st_int32, st.lists(st_int32))) diff --git a/test/test_jit.py b/test/test_jit.py index 7abb13100f..7c6bf77206 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -22,7 +22,7 @@ def _simple_test(add, extract=lambda x: x, N=10): class TestJit(unittest.TestCase): @settings(deadline=2e4) - 
@unittest.skipUnless(Device.DEFAULT in ["LLVM", "CLANG"], f"no support on {Device.DEFAULT}") + @unittest.skipUnless(Device.DEFAULT in ["LLVM", "CPU"], f"no support on {Device.DEFAULT}") @given(strat.sampled_from([Tensor.exp2, Tensor.log2, Tensor.sin])) def test_approx_jit_timeout(self, op): with Context(TRANSCENDENTAL=2): @@ -497,8 +497,8 @@ class TestCopyInsideJit(unittest.TestCase): @TinyJit def add(x,y) -> Tensor: return x.to(Device.DEFAULT)+y for _ in range(5): - # create a Tensor in CLANG - a = Tensor.rand(16,16,device="CLANG").realize() + # create a Tensor on CPU + a = Tensor.rand(16,16,device="CPU").realize() b = Tensor.rand(16,16).realize() out = add(a,b) np.testing.assert_allclose(out.flatten().tolist(), [x+y for x,y in zip(a.flatten().tolist(), b.flatten().tolist())]) @@ -529,12 +529,12 @@ class TestJitPrune(unittest.TestCase): w2_prune = TinyJit(w2, prune=True) for _ in range(3): - a = Tensor.rand(16, device="CLANG").realize() + a = Tensor.rand(16, device="CPU").realize() out = w2_noprune(a) np.testing.assert_allclose(out.tolist(), [x*2+y for x,y in zip(weights.tolist(), a.tolist())]) for _ in range(3): - a = Tensor.rand(16, device="CLANG").realize() + a = Tensor.rand(16, device="CPU").realize() out = w2_prune(a) np.testing.assert_allclose(out.tolist(), [x*2+y for x,y in zip(weights.tolist(), a.tolist())]) diff --git a/test/test_kernel_cache.py b/test/test_kernel_cache.py index 851f4d83fd..164b501a41 100644 --- a/test/test_kernel_cache.py +++ b/test/test_kernel_cache.py @@ -5,7 +5,7 @@ from tinygrad import Device class TestKernelCache(unittest.TestCase): def test_kernel_cache_in_action(self): - if Device.DEFAULT not in ["CLANG"]: + if Device.DEFAULT not in ["CPU"]: self.skipTest("No custom kernel cache is implemented") unique_const = 0.6765677269 @@ -16,14 +16,14 @@ class TestKernelCache(unittest.TestCase): a1 = Tensor.rand(4,4).realize() b1 = Tensor.rand(4,4).realize() - orig_compile_func = Device['CLANG'].compiler - Device['CLANG'].compiler = None # making it not callable + orig_compile_func = Device['CPU'].compiler + Device['CPU'].compiler = None # making it not callable try: x1 = a1 + b1 + unique_const x1.realize() # Same kernel should be from cache. finally: - Device['CLANG'].compiler = orig_compile_func + Device['CPU'].compiler = orig_compile_func if __name__ == "__main__": unittest.main() diff --git a/test/test_linearizer.py b/test/test_linearizer.py index 9c718b279e..9efa36b038 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -64,7 +64,7 @@ def helper_tc_ensure_uops_and_opts_count(n: int, m:int, k:int, dtype_in:DType, d class TestLinearizer(unittest.TestCase): def test_arg_dedup(self): # NOTE: this realize exists because Tensor.numpy calls .contiguous() internally - # without contiguous folding, rand.to("CLANG") and rand.contiguous().to("CLANG") are different UOps. + # without contiguous folding, rand.to("CPU") and rand.contiguous().to("CPU") are different UOps. # this test asserts they are the identical Buffer # having different buffers is fine for correctness, because the outputs match. a, b = Tensor.randn(4).realize(), Tensor.randn(4).realize() @@ -983,8 +983,8 @@ class TestLinearizer(unittest.TestCase): # NOTE: can reenable, it does work. 
it just makes BEAM slow @unittest.expectedFailure - @unittest.skipUnless(Device.DEFAULT == "CLANG", "test only for CLANG") - def test_upcast_with_locals_clang(self): + @unittest.skipUnless(Device.DEFAULT == "CPU", "test only for CPU") + def test_upcast_with_locals_cpu(self): out = Tensor.ones(64,64).contiguous() @ Tensor.ones(64,64).contiguous() k = Kernel(out.schedule()[-1].ast) k.apply_opt(Opt(OptOps.LOCAL, axis=0, arg=4)) @@ -1136,7 +1136,7 @@ class TestLinearizer(unittest.TestCase): assert u.src[-1].src[0].op != Ops.ASSIGN @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores") - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"}, "CLANG does not support using a different type for accumulation") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"}, "CPU does not support using a different type for accumulation") def test_tensor_cores_unroll_casted_phi(self): tc = [tc for tc in Device[Device.DEFAULT].renderer.tensor_cores if tc.dtype_in != tc.dtype_out][0] x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in) @@ -1148,7 +1148,7 @@ class TestLinearizer(unittest.TestCase): assert u.src[-1].src[0].op != Ops.ASSIGN @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores") - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"}, "CLANG does not support using a different type for accumulation") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"}, "CPU does not support using a different type for accumulation") def test_tensor_cores_unroll_casted_phi_with_children(self): # all ASSIGN children are outside the loop tc = [tc for tc in Device[Device.DEFAULT].renderer.tensor_cores if tc.dtype_in != tc.dtype_out][0] @@ -1445,7 +1445,7 @@ class TestFloat4(unittest.TestCase): assert TestFloat4.count_float4(k) == (2, 1) - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "CLANG with AMX upcasts float up to size 16") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "CPU with AMX upcasts float up to size 16") def test_float4_multidim(self): a = Tensor.rand(2, 8).realize() b = Tensor.rand(2, 8).realize() @@ -1462,7 +1462,7 @@ class TestFloat4(unittest.TestCase): assert TestFloat4.count_float4(k) == (4, 2) - @unittest.skipUnless(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "Only CLANG with AMX upcasts float up to size 16") + @unittest.skipUnless(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "Only CPU with AMX upcasts float up to size 16") def test_float4_multidim_amx(self): def kernel_for_shape(size, shift): a = Tensor.rand(2, size).realize() @@ -1487,7 +1487,7 @@ class TestFloat4(unittest.TestCase): for i in range(len(sizes)): assert TestFloat4.count_float4(kernel_for_shape(sizes[i], shifts[i]), excepted_upcast_size[i]) == expected_output[i] - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "CLANG with AMX upcasts float up to size 16") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "CPU with AMX upcasts float up to size 16") def test_float4_unaligned_load(self): a = Tensor.rand(9).realize().shrink(((1, 9),)) b = Tensor.rand(9).realize().shrink(((1, 9),)) @@ -1500,7 +1500,7 @@ class TestFloat4(unittest.TestCase): assert TestFloat4.count_float4(k) == (0, 1) - @unittest.skipIf(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "CLANG with AMX upcasts float up to size 16") + @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "CPU with AMX upcasts float up to size 16") def test_float4_multidim_unaligned_load(self): a = Tensor.rand(2, 
9).realize().shrink(((0, 2), (1, 9),)) b = Tensor.rand(2, 9).realize().shrink(((0, 2), (1, 9),)) @@ -1517,7 +1517,7 @@ class TestFloat4(unittest.TestCase): assert TestFloat4.count_float4(k) == (0, 2) - @unittest.skipUnless(Device.DEFAULT in {"CLANG", "LLVM"} and AMX, "Only CLANG with AMX upcasts float up to size 16") + @unittest.skipUnless(Device.DEFAULT in {"CPU", "LLVM"} and AMX, "Only CPU with AMX upcasts float up to size 16") def test_float4_multidim_unaligned_load_amx(self): def kernel_for_shape(size, shift): a = Tensor.rand(2, size).realize().shrink(((0, 2), (1, size),)) diff --git a/test/test_linearizer_failures.py b/test/test_linearizer_failures.py index b37bb524b3..6e70530b29 100644 --- a/test/test_linearizer_failures.py +++ b/test/test_linearizer_failures.py @@ -498,7 +498,7 @@ class TestLinearizerFailures(unittest.TestCase): opts = [Opt(op=OptOps.PADTO, axis=0, arg=32)] helper_test_lin(Kernel(ast), opts, failed_platforms=[]) - #@unittest.skipIf(Device.DEFAULT in ("LLVM", "METAL", "CLANG"), "flaky") + #@unittest.skipIf(Device.DEFAULT in ("LLVM", "METAL", "CPU"), "flaky") @unittest.skip("flaky everywhere") def test_failure_22(self): ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( diff --git a/test/test_pickle.py b/test/test_pickle.py index 3bfae36c89..be09ed7d57 100644 --- a/test/test_pickle.py +++ b/test/test_pickle.py @@ -38,7 +38,7 @@ class TestPickle(unittest.TestCase): def test_pickle_realized_tensor_alt(self): print("** init") - t = Tensor.rand(10, 10).to("CLANG").realize() + t = Tensor.rand(10, 10).to("CPU").realize() st = pickle.dumps(t) t_values = t.numpy() del t # free buffers @@ -50,7 +50,7 @@ class TestPickle(unittest.TestCase): def test_pickle_realized_tensor_alt2(self): print("** init") - t = Tensor.rand(10, 10).to("CLANG").realize() + t = Tensor.rand(10, 10).to("CPU").realize() tensor_uop = t.lazydata assert tensor_uop.is_realized, f"expected {tensor_uop} to be realized" t_values = t.numpy() @@ -93,7 +93,7 @@ class TestPickle(unittest.TestCase): np.testing.assert_equal(vt2.numpy(), 20) def test_pickle_buffer_view(self): - t = Tensor.arange(10, device="CLANG").contiguous().realize() + t = Tensor.arange(10, device="CPU").contiguous().realize() vt = t[3:5].contiguous().realize() assert hasattr(vt.lazydata.buffer, 'base') ref_value = vt.tolist() diff --git a/test/test_randomness.py b/test/test_randomness.py index fa351082a9..0581ee65ff 100644 --- a/test/test_randomness.py +++ b/test/test_randomness.py @@ -239,7 +239,7 @@ class TestRandomness(unittest.TestCase): numpy_func=lambda x: np.random.randint(low=-2, high=5, size=x))) self.assertTrue(equal_distribution(partial(Tensor.randint, low=-2, high=5, dtype="int32"), numpy_func=lambda x: np.random.randint(low=-2, high=5, size=x))) - self.assertTrue(Tensor.randint(1, device="CLANG").device=="CLANG") + self.assertTrue(Tensor.randint(1, device="CPU").device=="CPU") # check types of args with self.assertRaises(TypeError): Tensor.randint((3, 4), low=0.1, high=3) with self.assertRaises(TypeError): Tensor.randint((3, 4), low=0, high=3.5) diff --git a/test/test_renderer_failures.py b/test/test_renderer_failures.py index 0868062e03..ebe8a19329 100644 --- a/test/test_renderer_failures.py +++ b/test/test_renderer_failures.py @@ -38,7 +38,7 @@ class TestCStyleFailures(unittest.TestCase): store = UOp.store(a.index(idx), alu) sink = UOp(Ops.SINK, dtypes.void, (store,)) uops = linearize_uop(full_graph_rewrite(sink, Device[Device.DEFAULT].renderer)) - # CLANG doesn't use the max function + # CPU doesn't use the max function ret = 
_test_uop_result([Tensor([1])], uops)[0] self.assertEqual(ret[0], 1) diff --git a/test/test_schedule.py b/test/test_schedule.py index f877ef076d..f4ae67bf3f 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -1732,16 +1732,16 @@ class TestIndexing(unittest.TestCase): self.assertIs(sched[1].ast.op, Ops.BUFFER_VIEW) np.testing.assert_equal(a.numpy(), [[4, 5]]) - @unittest.skipIf(Device.DEFAULT == "CLANG", "tests copy from ext device") + @unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from ext device") def test_arange_shrink_copy(self): - a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).to("CLANG") + a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).to("CPU") sched = self.check_schedule(a, 1) self.assertIs(sched[-1].ast.op, Ops.COPY) np.testing.assert_equal(a.numpy(), [[4, 5]]) - @unittest.skipIf(Device.DEFAULT == "CLANG", "tests copy from ext device") + @unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from ext device") def test_arange_expand_copy(self): - a = Tensor.arange(4).reshape(2, 2, 1).expand(2, 2, 2).contiguous().to("CLANG") + a = Tensor.arange(4).reshape(2, 2, 1).expand(2, 2, 2).contiguous().to("CPU") sched = self.check_schedule(a, 1) self.assertIs(sched[1].ast.op, Ops.COPY) self.assertIs(sched[0].ast.src[0].src[2].op, Ops.ADD) @@ -2279,23 +2279,23 @@ class TestConst(unittest.TestCase): run_schedule(sched, var_vals) self.assertEqual(a.tolist(), 3) -@unittest.skipIf(Device.DEFAULT == "CLANG", "tests copy from another device to clang") +@unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from another device to cpu") class TestCopyFolding(unittest.TestCase): def test_const_copy_is_free(self): - b = Tensor(1).to("CLANG") + b = Tensor(1).to("CPU") check_schedule(b, 0, filter_sink=False) assert b.item() == 1 def test_late_const_copy_folding(self): a = Tensor.arange(3).realize() zeros = Tensor.zeros(3).realize() - b = (a*zeros).to("CLANG") + b = (a*zeros).to("CPU") run_schedule(check_schedule(b, 0, filter_sink=False)) self.assertListEqual(b.tolist(), [0, 0, 0]) def test_alu_after_copy(self): - a = Tensor.ones((4,)).to("CLANG").lazydata - b = Tensor.empty(4, device="CLANG").lazydata + a = Tensor.ones((4,)).to("CPU").lazydata + b = Tensor.empty(4, device="CPU").lazydata add = a+b add = schedule_graph_rewrite(add) assert all_same([x.device for x in add.src]), f"ALU has different devices! 
{[x.device for x in add.src]}" @@ -2348,13 +2348,13 @@ class TestCopyFolding(unittest.TestCase): def test_permute_on_disk(self): with open(temp('dt_arange_4_permute'), "wb") as f: f.write(Tensor.arange(4).realize().lazydata.base.buffer.as_buffer()) a = Tensor.empty(4, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_4_permute')}") - b = a.reshape(2, 2).permute(1, 0).to("CLANG") + b = a.reshape(2, 2).permute(1, 0).to("CPU") b.realize() self.assertListEqual(b.tolist(), [[0, 2], [1, 3]]) def test_permute_after_shrink(self): a = Tensor.arange(5) - b = a.shrink(((0, 4),)).reshape(2, 2).permute(1, 0).to("CLANG") + b = a.shrink(((0, 4),)).reshape(2, 2).permute(1, 0).to("CPU") b.realize() self.assertListEqual(b.tolist(), [[0, 2], [1, 3]]) @@ -2364,7 +2364,7 @@ class TestCopyFolding(unittest.TestCase): def test_permute_after_shrink_on_disk(self): with open(temp('dt_arange_5_permute'), "wb") as f: f.write(Tensor.arange(5).realize().lazydata.base.buffer.as_buffer()) a = Tensor.empty(5, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_5_permute')}") - b = a.shrink(((0, 4),)).reshape(2, 2).permute(1, 0).to("CLANG") + b = a.shrink(((0, 4),)).reshape(2, 2).permute(1, 0).to("CPU") b.realize() self.assertListEqual(b.tolist(), [[0, 2], [1, 3]]) diff --git a/test/test_tensor.py b/test/test_tensor.py index 0728a3f41b..7a7d21dabf 100644 --- a/test/test_tensor.py +++ b/test/test_tensor.py @@ -247,7 +247,7 @@ class TestTinygrad(unittest.TestCase): assert a.shape == b.shape, f"shape mismatch {a.shape} != {b.shape}" def test_rand_like_device(self): - a = Tensor.ones(3, 3, device="CLANG") + a = Tensor.ones(3, 3, device="CPU") b = Tensor.rand_like(a) self.assertEqual(b.device, a.device) @@ -326,7 +326,7 @@ class TestTinygrad(unittest.TestCase): def test_tensor_from_blob(self): x = memoryview(bytearray(16)).cast('I') - t = Tensor.from_blob(mv_address(x), (4,), dtype=dtypes.int, device="CLANG") + t = Tensor.from_blob(mv_address(x), (4,), dtype=dtypes.int, device="CPU") z = (t+1) np.testing.assert_equal(z.numpy(), [1, 1, 1, 1]) @@ -695,7 +695,7 @@ class TestZeroShapeTensor(unittest.TestCase): class TestTensorCreationDevice(unittest.TestCase): # test auxiliary tensors are created on the same device def test_one_hot(self): - y = Tensor([1, 2, 3]).to("CLANG") + y = Tensor([1, 2, 3]).to("CPU") x = y.one_hot(10) x.realize() diff --git a/test/test_uop_graph.py b/test/test_uop_graph.py index ee06194af2..7513bd70bd 100644 --- a/test/test_uop_graph.py +++ b/test/test_uop_graph.py @@ -661,7 +661,7 @@ class TestLoadStoreFolder(unittest.TestCase): sink = float4_rewrite(sink.sink()) assert len([x for x in sink.toposort if x.op is Ops.LOAD]) == 1 - @unittest.skipIf(Device.DEFAULT in {"CLANG"} and AMX, "CLANG with AMX upcasts float up to size 16") + @unittest.skipIf(Device.DEFAULT in {"CPU"} and AMX, "CPU with AMX upcasts float up to size 16") def test_two_load_fold(self): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr()) load = [UOp(Ops.LOAD, dtypes.float, (buf.index(UOp.const(dtypes.int, i)),)) for i in range(8)] diff --git a/test/test_uops.py b/test/test_uops.py index ef9952c5ba..5a2c476822 100644 --- a/test/test_uops.py +++ b/test/test_uops.py @@ -106,11 +106,11 @@ class TestUOps(unittest.TestCase): self._equal(f([a,b,c], op, dts), fxn(a,b,c)) class TestFloatUOps(TestUOps): - @unittest.skipIf(Device.DEFAULT == "CLANG", 'not supported as uop') + @unittest.skipIf(Device.DEFAULT == "CPU", 'not supported as uop') def test_exp2(self): self._test_uop_fxn(Ops.EXP2, lambda a: np.exp2(a)) - @unittest.skipIf(Device.DEFAULT == "CLANG", 
'not supported as uop') + @unittest.skipIf(Device.DEFAULT == "CPU", 'not supported as uop') def test_log2(self): self._test_uop_fxn(Ops.LOG2, lambda a: math.log2(a) if a > 0 else float('-inf' if a==0 else 'nan')) - @unittest.skipIf(Device.DEFAULT == "CLANG", 'not supported as uop') + @unittest.skipIf(Device.DEFAULT == "CPU", 'not supported as uop') def test_sin(self): self._test_uop_fxn(Ops.SIN, lambda a: math.sin(a)) def test_recip(self): self._test_uop_fxn(Ops.RECIP, lambda a: 1/a if a != 0 else float('inf')) def test_sqrt(self): self._test_uop_fxn(Ops.SQRT, lambda a: math.sqrt(a) if a >= 0 else float('nan')) diff --git a/test/test_uops_stats.py b/test/test_uops_stats.py index e224933ce3..3845629d45 100644 --- a/test/test_uops_stats.py +++ b/test/test_uops_stats.py @@ -65,12 +65,12 @@ class TestMemoryCount(unittest.TestCase): _, mem = get_stats(a.assign(a+a)) self.assertEqual(mem, 1024*1024*2) # 1 read + 1 write - @unittest.skipIf(Device.DEFAULT == "CLANG", "test copy to CLANG from other device") + @unittest.skipIf(Device.DEFAULT == "CPU", "test copy to CPU from other device") def test_copyout(self): - a = Tensor.empty(32, dtype=dtypes.uint8).to("CLANG") + a = Tensor.empty(32, dtype=dtypes.uint8).to("CPU") _, mem = get_stats(a) self.assertEqual(mem, 32*1) - a = Tensor.empty(32, dtype=dtypes.uint32).to("CLANG") + a = Tensor.empty(32, dtype=dtypes.uint32).to("CPU") _, mem = get_stats(a) self.assertEqual(mem, 32*4) diff --git a/test/test_zero_copy.py b/test/test_zero_copy.py index 1b999314c3..6f2b2cda0b 100644 --- a/test/test_zero_copy.py +++ b/test/test_zero_copy.py @@ -13,7 +13,7 @@ def time_tensor_numpy(out:Tensor): N = 4096 class TestZeroCopy(unittest.TestCase): - @unittest.skipIf(Device.DEFAULT not in {"CLANG", "LLVM", "METAL"}, "device isn't zero copy") + @unittest.skipIf(Device.DEFAULT not in {"CPU", "LLVM", "METAL"}, "device isn't zero copy") def test_zero_copy_from_default_to_cpu(self): demo = Tensor.rand(1).realize() t1 = time_tensor_numpy(demo) diff --git a/test/testextra/test_f16_decompress.py b/test/testextra/test_f16_decompress.py index 9cfa936a49..6077f67a90 100644 --- a/test/testextra/test_f16_decompress.py +++ b/test/testextra/test_f16_decompress.py @@ -7,7 +7,7 @@ import numpy as np class TestF16Decompression(unittest.TestCase): def test_u32_to_f16(self): - a = Tensor.randn(50, dtype=dtypes.float16, device=None if is_dtype_supported(dtypes.float16) else "CLANG:0") + a = Tensor.randn(50, dtype=dtypes.float16, device=None if is_dtype_supported(dtypes.float16) else "CPU") f16_as_u32 = a.bitcast(dtypes.uint32) if is_dtype_supported(dtypes.float16) else a.bitcast(dtypes.uint32).to(Device.DEFAULT) f16 = u32_to_f16(f16_as_u32) ref = a.numpy() diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py index 6291216d1d..8f37ff7eb0 100644 --- a/test/unit/test_disk_tensor.py +++ b/test/unit/test_disk_tensor.py @@ -211,7 +211,7 @@ class TestSafetensors(unittest.TestCase): def helper_test_disk_tensor(fn, data, np_fxn, tinygrad_fxn=None): if tinygrad_fxn is None: tinygrad_fxn = np_fxn pathlib.Path(temp(fn)).unlink(missing_ok=True) - tinygrad_tensor = Tensor(data, device="CLANG").to(f"disk:{temp(fn)}") + tinygrad_tensor = Tensor(data, device="CPU").to(f"disk:{temp(fn)}") numpy_arr = np.array(data) tinygrad_fxn(tinygrad_tensor) np_fxn(numpy_arr) @@ -251,7 +251,7 @@ class TestDiskTensor(unittest.TestCase): def test_write_ones(self): pathlib.Path(temp("dt_write_ones")).unlink(missing_ok=True) - out = Tensor.ones(10, 10, device="CLANG").contiguous() + out = Tensor.ones(10, 
10, device="CPU").contiguous() outdisk = out.to(f"disk:{temp('dt_write_ones')}") print(outdisk) outdisk.realize() @@ -289,13 +289,13 @@ class TestDiskTensor(unittest.TestCase): def test_bitcast(self): with open(temp('dt_bitcast'), "wb") as f: f.write(bytes(range(10,20))) t = Tensor.empty(5, dtype=dtypes.int16, device=f"disk:{temp('dt_bitcast')}") - ret = t.to("CLANG").bitcast(dtypes.uint16) + 1 + ret = t.to("CPU").bitcast(dtypes.uint16) + 1 assert ret.tolist() == [2827, 3341, 3855, 4369, 4883] def test_bitcast_view(self): with open(temp('dt_bitcast_view'), "wb") as f: f.write(bytes(range(10, 24))) t = Tensor.empty(3, dtype=dtypes.uint, device=f"disk:{temp('dt_bitcast_view')}").shrink([(0, 2)]) - ret = t.bitcast(dtypes.uint16).to("CLANG") + 1 + ret = t.bitcast(dtypes.uint16).to("CPU") + 1 assert ret.tolist() == [2827, 3341, 3855, 4369] @unittest.skipIf(OSX, "new LLVM has an issue on OSX") @@ -363,10 +363,10 @@ class TestPathTensor(unittest.TestCase): np.testing.assert_array_equal(t.numpy(), np.frombuffer(self.test_data, dtype=np.uint8)) def test_path_tensor_with_device(self): - t = Tensor(self.test_file, device="CLANG") + t = Tensor(self.test_file, device="CPU") self.assertEqual(t.shape, (100,)) self.assertEqual(t.dtype, dtypes.uint8) - self.assertEqual(t.device, "CLANG") + self.assertEqual(t.device, "CPU") np.testing.assert_array_equal(t.numpy(), np.frombuffer(self.test_data, dtype=np.uint8)) def test_path_tensor_empty_file(self): @@ -391,8 +391,8 @@ class TestPathTensor(unittest.TestCase): def test_path_tensor_copy_to_device(self): t = Tensor(self.test_file) - t_cpu = t.to("CLANG") - self.assertEqual(t_cpu.device, "CLANG") + t_cpu = t.to("CPU") + self.assertEqual(t_cpu.device, "CPU") np.testing.assert_array_equal(t_cpu.numpy(), np.frombuffer(self.test_data, dtype=np.uint8)) if __name__ == "__main__": diff --git a/test/unit/test_elf.py b/test/unit/test_elf.py index f55b11e17e..cdf02b7728 100644 --- a/test/unit/test_elf.py +++ b/test/unit/test_elf.py @@ -1,5 +1,5 @@ import unittest, subprocess, platform -from tinygrad.runtime.ops_clang import ClangJITCompiler +from tinygrad.runtime.ops_cpu import ClangJITCompiler from tinygrad.runtime.support.elf import elf_loader class TestElfLoader(unittest.TestCase): diff --git a/test/unit/test_verify_ast.py b/test/unit/test_verify_ast.py index ccae3810b3..55adb79326 100644 --- a/test/unit/test_verify_ast.py +++ b/test/unit/test_verify_ast.py @@ -91,7 +91,7 @@ class TestVerifyAST(unittest.TestCase): def test_const_view_always_valid(self): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), 0) - a = UOp.const(dtypes.int, 0).replace(src=(UOp(Ops.VIEW, dtypes.void, (UOp(Ops.DEVICE, arg="CLANG"),), ShapeTracker.from_shape(())),)) + a = UOp.const(dtypes.int, 0).replace(src=(UOp(Ops.VIEW, dtypes.void, (UOp(Ops.DEVICE, arg="CPU"),), ShapeTracker.from_shape(())),)) st = UOp.store(buf, ShapeTracker.from_shape(()).to_uop(), a.cast(dtypes.float)) helper_test_verify_ast(st) diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 027ac28791..23fcdff637 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -374,7 +374,7 @@ class Kernel: check(smem_sz <= self.opts.shared_max, f"exceeds maximum shared memory size: needs {smem_sz}, max {self.opts.shared_max}") if opt.op is OptOps.LOCAL: # cyan - # NOTE: LLVM/CLANG can use locals too, but they are treated the same as globals (still helpful for L1 cache) + # NOTE: LLVM/CPU can use locals too, but they are treated the same as globals (still helpful for L1 cache) # it's disabled for 
now since it makes BEAM slow for little gain check(self.opts.has_local, "target does not support local") check(axis < self.global_dims, "local is for globals") diff --git a/tinygrad/device.py b/tinygrad/device.py index eb60565fdc..ce08cbc081 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -10,7 +10,7 @@ from tinygrad.renderer import Renderer # **************** Device **************** -ALL_DEVICES = ["METAL", "AMD", "NV", "CUDA", "QCOM", "GPU", "CLANG", "LLVM", "DSP", "WEBGPU"] +ALL_DEVICES = ["METAL", "AMD", "NV", "CUDA", "QCOM", "GPU", "CPU", "LLVM", "DSP", "WEBGPU"] class _Device: def __init__(self) -> None: self._devices = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")] diff --git a/tinygrad/nn/state.py b/tinygrad/nn/state.py index 99032cfe63..024ae6aba4 100644 --- a/tinygrad/nn/state.py +++ b/tinygrad/nn/state.py @@ -195,8 +195,8 @@ def torch_load(t:Tensor) -> dict[str, Tensor]: if tuple(permute_indexes) != tuple(range(len(permute_indexes))): intermediate_shape = tuple([shape_strides[x][0] for x in argsort(permute_indexes)]) assert tuple([shape_strides[i][1] for i in argsort(permute_indexes)]) == strides_for_shape(intermediate_shape), "nonpermutable strides" - if DEBUG >= 3: print(f"WARNING: this torch load is slow. CLANG to permute {intermediate_shape} with {permute_indexes}") - assert storage[1] != dtypes.bfloat16, "can't CLANG permute BF16" + if DEBUG >= 3: print(f"WARNING: this torch load is slow. to permute {intermediate_shape} with {permute_indexes}") + assert storage[1] != dtypes.bfloat16, "can't permute BF16" # TODO: find a nice way to support all shapetracker on disktensors ret = ret.to(None).reshape(intermediate_shape).permute(permute_indexes) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 4b7fde290a..7b16ffb616 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -181,7 +181,7 @@ class GroupOp: All = set(Ops) # some BUFFER ops can be processed with only a view -view_supported_devices = {"LLVM", "CLANG", "CUDA", "NV", "AMD", "METAL", "QCOM", "DSP", "DISK"} +view_supported_devices = {"LLVM", "CPU", "CUDA", "NV", "AMD", "METAL", "QCOM", "DSP", "DISK"} # https://en.wikipedia.org/wiki/Identity_element def identity_element(op:Ops, dt:DType) -> ConstType: return dtypes.as_const({Ops.ADD:0, Ops.MUL:1, Ops.MAX:dtypes.min(dt)}[op], dt) diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 9f0cfd2ca8..bcda50770f 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -18,7 +18,7 @@ base_rewrite = PatternMatcher([ lambda ctx,x: f"for ({ctx.render_dtype(x.dtype)} {ctx[x]} = {ctx[x.src[0]]}; {ctx[x]} < {ctx[x.src[1]]}; {ctx[x]}++) {{"), (UPat(Ops.VECTORIZE, name="x"), lambda ctx,x: f"{ctx.float4.replace('float4', ctx.render_dtype(x.dtype))}" + \ - (f"{{{','.join([ctx[y] for y in x.src])}}}" if ctx.device in {'CLANG', 'DSP'} else f"({','.join([ctx[y] for y in x.src])})")), + (f"{{{','.join([ctx[y] for y in x.src])}}}" if ctx.device in {'CPU', 'DSP'} else f"({','.join([ctx[y] for y in x.src])})")), (UPat(Ops.CAST, name="x"), lambda ctx,x: f"__builtin_convertvector({ctx[x.src[0]]}, {ctx.render_dtype(x.dtype)})" if x.dtype.count > 1 and not isinstance(x.dtype, PtrDType) else None), (UPat(Ops.CAST, name="x"), lambda ctx,x: f"({ctx.render_cast(x.dtype, ctx[x.src[0]])})"), @@ -52,7 +52,7 @@ base_rewrite = PatternMatcher([ (UPat(GroupOp.ALU, name="x"), lambda ctx,x: ctx.code_for_op[x.op]( *([strip_parens(ctx[v]) if v.op == x.op and x.op in {Ops.ADD, Ops.MUL, 
Ops.XOR} else ctx[v] for v in x.src]), x.dtype)), (UPat(Ops.GEP, name="x"), lambda ctx,x: ctx[x.src[0]] + \ - (f"[{x.arg[0]}]" if x.src[0].dtype.count > (8 if ctx.device in {"CUDA", "NV"} else 4) or ctx.device in {'CLANG', 'DSP'} else \ + (f"[{x.arg[0]}]" if x.src[0].dtype.count > (8 if ctx.device in {"CUDA", "NV"} else 4) or ctx.device in {'CPU', 'DSP'} else \ f".{'xyzwabcd'[x.arg[0]]}")), # custom passes through with format (UPat(Ops.CUSTOM, name="x"), lambda ctx,x: x.arg.format(*[ctx[y] for y in x.src])), @@ -175,7 +175,7 @@ class CStyleLanguage(Renderer): return self.render_kernel(name, kernel, list(bufs.values()), uops) class ClangRenderer(CStyleLanguage): - device = "CLANG" + device = "CPU" float4 = "(float4)" has_local = False global_max = None diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_cpu.py similarity index 98% rename from tinygrad/runtime/ops_clang.py rename to tinygrad/runtime/ops_cpu.py index 58eb84b591..b8b34f62c8 100644 --- a/tinygrad/runtime/ops_clang.py +++ b/tinygrad/runtime/ops_cpu.py @@ -20,3 +20,5 @@ class ClangJITCompiler(Compiler): class ClangDevice(Compiled): def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram) + +CPUDevice = ClangDevice \ No newline at end of file diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py index 7917928652..3256d88c33 100644 --- a/tinygrad/runtime/ops_llvm.py +++ b/tinygrad/runtime/ops_llvm.py @@ -17,7 +17,7 @@ class LLVMCompiler(Compiler): triple = {'AArch64': b'aarch64', 'X86': b'x86_64'}[host_arch] + b'-none-unknown-elf' target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt) - # +reserve-x18 here does the same thing as -ffixed-x18 in ops_clang.py, see comments there for why it's needed on arm osx + # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures()) if DEBUG >= 2: print(f"LLVM init for {cpu!r} with {feats!r}") self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, cpu, feats, diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py index 039ed786c2..427340cb61 100644 --- a/tinygrad/runtime/ops_python.py +++ b/tinygrad/runtime/ops_python.py @@ -173,7 +173,7 @@ class PythonProgram: # C, D (8 elements on 8 threads) def c_map(lane, elem): return (lane, elem) ul[i] = wmma_helper(8, 16, 16, 16, 8, a_elem, b_elem, c_map) - elif arg[4] == "CLANG": + elif arg[4] == "CPU": def elem(x, col, row, _): return x[col+row][0] # k is always 0 def c_map(_, elem): return (elem%16, elem//16) ul[i] = wmma_helper(1, 1, 16, 16, 256, elem, elem, c_map) @@ -194,7 +194,7 @@ class PythonRenderer(Renderer): if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm80 if getenv("EMULATE_CUDA_SM75"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm75 if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores - if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CLANG", ClangRenderer.tensor_cores + if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CPU", ClangRenderer.tensor_cores def render(self, uops:list[UOp]) -> str: lops = [(u.op, u.dtype, [uops.index(v) for v in u.src], u.arg) for u in uops] diff --git a/tinygrad/tensor.py 
b/tinygrad/tensor.py index 303a747703..a703587078 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -274,7 +274,7 @@ class Tensor(SimpleMathTrait): def assign(self, x) -> Tensor: # TODO: this is a hack for writing to DISK. remove with working assign if isinstance(self.device, str) and self.device.startswith("DISK"): - if x.__class__ is not Tensor: x = Tensor(x, device="CLANG", dtype=self.dtype) + if x.__class__ is not Tensor: x = Tensor(x, device="CPU", dtype=self.dtype) self.contiguous().realize().lazydata.base.realized.ensure_allocated().copyin(x._data()) return self if x.__class__ is not Tensor: x = Tensor(x, device=self.device, dtype=self.dtype) @@ -297,11 +297,11 @@ class Tensor(SimpleMathTrait): def _data(self) -> memoryview: if 0 in self.shape: return memoryview(bytearray(0)) # NOTE: this realizes on the object from as_buffer being a Python object - cpu = self.cast(self.dtype.base).contiguous().to("CLANG").realize() + cpu = self.cast(self.dtype.base).contiguous().to("CPU").realize() buf = cast(UOp, cpu.lazydata).base.realized assert buf is not None, f"{cast(UOp, cpu.lazydata).base} was not realized" - if self.device != "CLANG": buf.options = BufferSpec(nolru=True) - return buf.as_buffer(allow_zero_copy=True if self.device != "CLANG" else False) + if self.device != "CPU": buf.options = BufferSpec(nolru=True) + return buf.as_buffer(allow_zero_copy=True if self.device != "CPU" else False) def data(self) -> memoryview: """ @@ -520,8 +520,8 @@ class Tensor(SimpleMathTrait): if (numel := prod(shape)) == 0: return Tensor.zeros(shape, device=_device, dtype=dtype, **kwargs) num = ceildiv(numel * dtype.itemsize, 4) - # when using MOCKGPU and NV generate rand on CLANG - if getenv("MOCKGPU") and device.startswith("NV"): device = "CLANG" + # when using MOCKGPU and NV generate rand on CPU + if getenv("MOCKGPU") and device.startswith("NV"): device = "CPU" # generate per device seeds and rng counter if we haven't seen this device yet if device not in Tensor._device_seeds:
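
The change above is a mechanical rename (`CLANG` → `CPU`, `ops_clang.py` → `ops_cpu.py`); the user-visible effect is only the device name and the environment variable. Below is a minimal sketch of exercising the renamed backend once this diff is applied, using only pieces shown elsewhere in the patch (`CPU=1`, `Device.DEFAULT`, explicit `device="CPU"` strings); it is an illustration, not part of the patch itself.

```python
# minimal sketch, assuming this diff is applied
# run as: CPU=1 python3 check_cpu_backend.py   (CPU=1 replaces the old CLANG=1)
import os
os.environ.setdefault("CPU", "1")   # force the CPU (clang) backend, as compile_tensorflow.py now does

from tinygrad import Tensor, Device

print(Device.DEFAULT)               # expected to print "CPU" instead of the old "CLANG"

# tensors can still be placed on the backend explicitly by name
a = Tensor([[1, 2], [3, 4]], device="CPU")
b = a.to(Device.DEFAULT) + 1
print(b.tolist())                   # [[2, 3], [4, 5]]
```

This mirrors what the CI matrix in test.yml now does: the `cpu` backend entry exports `CPU=1` and then asserts `Device.DEFAULT` resolves to `CPU`.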