Mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-01-09 23:18:04 -05:00.
@@ -67,7 +67,7 @@ class Function:

 # %%
 # == LazyBuffer (in tinygrad/lazy.py, code 5/10) ==
-from tinygrad.helpers import DType
+from tinygrad.dtype import DType

 # this is where the properties live that you thought were a part of Tensor
 # LazyBuffer is like a Tensor without derivatives, at the mlop layer
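The same mechanical change repeats throughout this diff: the dtype machinery (`DType`, `dtypes`, `ImageDType`, `PtrDType`, and the promotion helpers) moves out of `tinygrad.helpers` into the new `tinygrad/dtype.py`, while `dtypes` stays re-exported from the top-level package. A minimal sketch of the import forms that work after this change (assuming a checkout at this commit):

```python
# both resolve to the same objects after this diff
from tinygrad import dtypes                # re-export kept in tinygrad/__init__.py
from tinygrad.dtype import DType, dtypes   # the new canonical home

assert isinstance(dtypes.int32, DType)
```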
@@ -37,7 +37,7 @@ print("******** second, the Device ***********")
 DEVICE = "CLANG" # NOTE: you can change this!

 import struct
-from tinygrad.helpers import dtypes
+from tinygrad.dtype import dtypes
 from tinygrad.device import Buffer, Device
 from tinygrad.ops import LazyOp, BufferOps, MemBuffer, BinaryOps
 from tinygrad.shape.shapetracker import ShapeTracker
@@ -55,7 +55,7 @@ There are even more of these factory methods, you can find them in the [tensor.p
 All the tensors creation methods can take a `dtype` argument to specify the data type of the tensor.

 ```python
-from tinygrad.helpers import dtypes
+from tinygrad.dtype import dtypes

 t3 = Tensor([1, 2, 3, 4, 5], dtype=dtypes.int32)
 ```
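Extending the docs snippet above, a small sketch of inspecting the resulting dtype; the printed reprs assume the alias table in the new dtype.py, where `int` is the last alias registered for the `int32` DType:

```python
from tinygrad import Tensor, dtypes

t3 = Tensor([1, 2, 3, 4, 5], dtype=dtypes.int32)
print(t3.dtype)           # dtypes.int (the int32 DType, shown under its `int` alias)
print(t3.dtype.itemsize)  # 4 bytes per element
```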
@@ -1,7 +1,6 @@
 from typing import Tuple
 import time
-from tinygrad import Tensor, TinyJit, nn, Variable
-from tinygrad.helpers import dtypes # TODO: wouldn't need this if argmax returned the right dtype
+from tinygrad import Tensor, TinyJit, nn, Variable, dtypes
 import gymnasium as gym
 from tqdm import trange
 import numpy as np # TODO: remove numpy import
@@ -16,8 +16,8 @@ from vits import Y_LENGTH_ESTIMATE_SCALARS, HParams, Synthesizer, TextMapper, ge
 from whisper import init_whisper, transcribe_waveform
 from sentencepiece import SentencePieceProcessor

-from tinygrad.helpers import Timing, dtypes, fetch
-from tinygrad.tensor import Tensor
+from tinygrad.helpers import Timing, fetch
+from tinygrad import Tensor, dtypes

 # Whisper constants
 RATE = 16000
@@ -1,7 +1,5 @@
 import numpy as np
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import dtypes
-from tinygrad import Device
+from tinygrad import Device, dtypes, Tensor

 # TODO: will be better when tinygrad does math in the target dtype, can remove the floor and use a mul
 def bit_extract(x, s, e) -> Tensor:
@@ -4,13 +4,13 @@ from tqdm import trange
 import numpy as np
 from tinygrad import Device, GlobalCounters
 from typing import Optional, Union
-from tinygrad.tensor import Tensor
+from tinygrad import Tensor, dtypes
 from tinygrad.nn import Embedding, Linear, LayerNorm
 from tinygrad.shape.symbolic import Variable
 from tinygrad.jit import TinyJit
 import tiktoken
 from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict
-from tinygrad.helpers import Timing, DEBUG, getenv, fetch, colored, dtypes
+from tinygrad.helpers import Timing, DEBUG, getenv, fetch, colored

 MAX_CONTEXT = getenv("MAX_CONTEXT", 128)
 HALF = getenv("HALF")
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # setup for distributed
 from extra import dist
-from tinygrad.helpers import getenv, dtypes
+from tinygrad.helpers import getenv
 if __name__ == "__main__":
   if getenv("DIST"):
     dist.preinit()
@@ -14,7 +14,7 @@ import random, time
 import numpy as np
 from typing import Any, Dict, Optional, SupportsIndex
 from extra.datasets import fetch_cifar, cifar_mean, cifar_std
-from tinygrad import nn
+from tinygrad import nn, dtypes
 from tinygrad.nn.state import get_state_dict
 from tinygrad.nn import optim
 from tinygrad import Device, GlobalCounters
@@ -7,8 +7,8 @@ from pathlib import Path
 import sys, argparse, json
 import numpy as np
 np.set_printoptions(linewidth=200)
-from tinygrad.helpers import Timing, Profiling, getenv, DEBUG, dtypes, colored
-from tinygrad import Device, GlobalCounters
+from tinygrad.helpers import Timing, Profiling, getenv, DEBUG, colored
+from tinygrad import Device, GlobalCounters, dtypes
 from tinygrad.tensor import Tensor
 from tinygrad.nn.state import safe_load, torch_load, load_state_dict, get_parameters
 from extra.models.llama import Transformer, convert_from_huggingface
@@ -1,9 +1,9 @@
 import time
 from pathlib import Path
 import numpy as np
-from tinygrad import Tensor, GlobalCounters
+from tinygrad import Tensor, GlobalCounters, dtypes
 from tinygrad.jit import TinyJit
-from tinygrad.helpers import getenv, dtypes
+from tinygrad.helpers import getenv
 from examples.mlperf import helpers

 def eval_resnet():
@@ -4,9 +4,8 @@ import sys, logging, time, io, math, argparse, operator, numpy as np
 from functools import partial, reduce
 from pathlib import Path
 from typing import Tuple, Optional, Type
-from tinygrad import nn
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import dtypes, getenv
+from tinygrad import nn, dtypes, Tensor
+from tinygrad.helpers import getenv
 from tinygrad.nn.state import torch_load
 from examples.vits import ResidualCouplingBlock, PosteriorEncoder, Encoder, ResBlock1, ResBlock2, LRELU_SLOPE, sequence_mask, split, download_if_not_present, get_hparams_from_file, load_checkpoint, weight_norm, HParams
 from examples.sovits_helpers import preprocess
@@ -1,7 +1,6 @@
 import math
 from typing import Optional, Tuple
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import dtypes
+from tinygrad import Tensor, dtypes
 import librosa
 import soundfile
 import numpy as np
@@ -9,9 +9,8 @@ from collections import namedtuple
 from PIL import Image
 import numpy as np
 from tqdm import tqdm
-from tinygrad.tensor import Tensor
-from tinygrad import Device, GlobalCounters
-from tinygrad.helpers import dtypes, Timing, Context, getenv, fetch, colored
+from tinygrad import Device, GlobalCounters, dtypes, Tensor
+from tinygrad.helpers import Timing, Context, getenv, fetch, colored
 from tinygrad.nn import Conv2d, Linear, GroupNorm, LayerNorm, Embedding
 from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict
 from tinygrad.jit import TinyJit
@@ -5,8 +5,8 @@ from phonemizer.punctuation import Punctuation
 from functools import reduce
 from pathlib import Path
 from typing import List
-from tinygrad import nn
-from tinygrad.helpers import dtypes, fetch
+from tinygrad import nn, dtypes
+from tinygrad.helpers import fetch
 from tinygrad.nn.state import torch_load
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
@@ -2,7 +2,7 @@
 import numpy as np
 import pickle
 from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer
-from tinygrad.helpers import dtypes
+from tinygrad import dtypes
 from tqdm import trange, tqdm
 from matplotlib import pyplot as plt

@@ -1,7 +1,8 @@
 from typing import Tuple, List, NamedTuple, Any, Dict, Optional, Union, DefaultDict, cast
 from tinygrad.codegen.linearizer import UOps, MemOp, UOp
 from tinygrad.ops import BinaryOps, UnaryOps
-from tinygrad.helpers import DType, dtypes, DEBUG
+from tinygrad.dtype import DType, dtypes
+from tinygrad.helpers import DEBUG
 from tinygrad.shape.symbolic import Variable, NumNode, MulNode, DivNode, ModNode, LtNode, SumNode, AndNode
 import functools
 import math
@@ -1,9 +1,10 @@
 import struct
 from platform import system
 from typing import Tuple, Dict, List, Optional
+from tinygrad import dtypes
 from tinygrad.ops import BinaryOps, UnaryOps, TernaryOps
 from tinygrad.codegen.linearizer import UOps, UOp
-from tinygrad.helpers import dtypes, CI
+from tinygrad.helpers import CI
 from tinygrad.codegen.assembly import uops_to_asmstyle, AssemblyLanguage

 def float_to_hex(x): return "%02X%02X%02X%02X" % tuple(struct.pack("f",x)[::-1])
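The `float_to_hex` helper visible in the context above is unchanged by this diff; as a sanity check, it formats the big-endian IEEE-754 bytes of a 32-bit float:

```python
import struct

def float_to_hex(x): return "%02X%02X%02X%02X" % tuple(struct.pack("f",x)[::-1])

print(float_to_hex(1.0))   # 3F800000: sign 0, exponent 127, mantissa 0
print(float_to_hex(-2.0))  # C0000000: sign 1, exponent 128, mantissa 0
```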
@@ -2,7 +2,7 @@ from typing import List
 import struct
 from tinygrad.codegen.assembly import uops_to_asmstyle, AssemblyLanguage
 from tinygrad.codegen.linearizer import UOps, UOp
-from tinygrad.helpers import dtypes
+from tinygrad import dtypes
 from tinygrad.ops import BinaryOps, UnaryOps, TernaryOps
 from tinygrad.runtime.ops_cuda import arch

@@ -1,6 +1,6 @@
 import yaml
 from typing import Tuple, Set, Dict
-from tinygrad.helpers import dtypes
+from tinygrad import dtypes
 from tinygrad.codegen.assembly import AssemblyCodegen, Register
 from tinygrad.codegen.linearizer import UOps
 from tinygrad.ops import BinaryOps, UnaryOps, TernaryOps
@@ -1,7 +1,7 @@
 import os, gzip, tarfile, pickle
 import numpy as np
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import dtypes, fetch
+from tinygrad import Tensor, dtypes
+from tinygrad.helpers import fetch

 def fetch_mnist(tensors=False):
   parse = lambda file: np.frombuffer(gzip.open(file).read(), dtype=np.uint8).copy()
@@ -1,5 +1,4 @@
-from tinygrad.helpers import dtypes
-from tinygrad.tensor import Tensor
+from tinygrad import Tensor, dtypes
 from extra.datasets.imagenet import iterate, get_val_files

 if __name__ == "__main__":
@@ -1,5 +1,5 @@
 from typing import Tuple, Dict, List
-from tinygrad.helpers import DType
+from tinygrad.dtype import DType
 from tinygrad.tensor import Device, Tensor
 from tinygrad.jit import TinyJit
 from tinygrad.nn.state import get_state_dict
@@ -1,82 +0,0 @@
-old = """__kernel void re_S256_16_8( write_only image2d_t data0, read_only image2d_t data1, read_only image2d_t data2, __global float* data3 ) {
-  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  int idx2 = get_global_id(0); /* 4 */
-  int idx1 = get_global_id(1); /* 16 */
-  int idx0 = get_global_id(2); /* 256 */
-  float acc0 = 0.0f;
-  for (int idx3 = 0; idx3 < 8; idx3++) {
-    float4 val1_0 = read_imagef(data1, smp, (int2)(((idx1*8)+idx3), 0)) /* (1, 128, 4) */;
-    float4 val2_0 = read_imagef(data2, smp, (int2)(((idx1*32)+(idx3*4)+idx2), idx0)) /* (256, 512, 4) */;
-    acc0+=(val1_0.x*val2_0.x);
-    acc0+=(val1_0.y*val2_0.y);
-    acc0+=(val1_0.z*val2_0.z);
-    acc0+=(val1_0.w*val2_0.w);
-  }
-  __local float temp[64];
-  temp[((idx1*4)+idx2)] = acc0;
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if (((idx1*4)+idx2) == 0) {
-    float4 output0 = (float4)(0.0f,0.0f,0.0f,0.0f);
-    for (int mid = 0; mid < 16; mid++) {
-      float4 val5_0 = ((__local float4*)temp)[mid];
-      output0.x+=val5_0.x;
-      output0.y+=val5_0.y;
-      output0.z+=val5_0.z;
-      output0.w+=val5_0.w;
-    }
-    float4 val3_0 = ((__global float4*)data3)[idx0];
-    write_imagef(data0, (int2)(idx0, 0), (float4)(max((output0.x+val3_0.x),(0.0f)),max((output0.y+val3_0.y),(0.0f)),max((output0.z+val3_0.z),(0.0f)),max((output0.w+val3_0.w),(0.0f)))); /* (1, 256, 4) */
-  }
-}"""
-
-new = """__kernel void r_256_16_4_8_4(write_only image2d_t data0, read_only image2d_t data1, read_only image2d_t data2, const __global float* data3) {
-  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  __attribute__ ((aligned (16))) __local float temp[64];
-  int gidx0 = get_group_id(0); /* 256 */
-  int lidx1 = get_local_id(1); /* 16 */
-  int lidx2 = get_local_id(0); /* 4 */
-  float acc0 = 0.0f;
-  for (int ridx0 = 0; ridx0 < 8; ++ridx0) {
-    float4 val0 = read_imagef(data1, smp, (int2)(((lidx1*8)+ridx0),0));
-    float4 val1 = read_imagef(data2, smp, (int2)(((lidx1*32)+lidx2+(ridx0*4)),gidx0));
-    acc0 = (((val0).x*(val1).x)+acc0);
-    acc0 = (((val0).y*(val1).y)+acc0);
-    acc0 = (((val0).z*(val1).z)+acc0);
-    acc0 = (((val0).w*(val1).w)+acc0);
-  }
-  temp[(lidx1*4)+lidx2] = acc0;
-  barrier(CLK_LOCAL_MEM_FENCE);
-  float4 acc1 = (float4)(0.0f,0.0f,0.0f,0.0f);
-  for (int ridx1 = 0; ridx1 < 16; ++ridx1) {
-    float4 val2 = (float4)(*((__local float4*)(temp+ridx1*4)));
-    (acc1).x = ((val2).x+(acc1).x);
-    (acc1).y = ((val2).y+(acc1).y);
-    (acc1).z = ((val2).z+(acc1).z);
-    (acc1).w = ((val2).w+(acc1).w);
-  }
-  float4 val3 = (float4)(*((__global float4*)(data3+gidx0*4)));
-  write_imagef(data0, (int2)(gidx0,0), (float4)(max(((acc1).x+(val3).x),0.0f),max(((acc1).y+(val3).y),0.0f),max(((acc1).z+(val3).z),0.0f),max(((acc1).w+(val3).w),0.0f)));
-}"""
-
-from tinygrad.runtime.ops_gpu import CLBuffer, CLProgram
-from tinygrad.helpers import dtypes, prod
-
-if __name__ == "__main__":
-  out = CLBuffer(prod((1, 128, 4)), dtypes.imageh((1,128,4)))
-  x = CLBuffer(prod((1, 128, 4)), dtypes.imageh((1,128,4)))
-  w = CLBuffer(prod((256, 512, 4)), dtypes.imageh((256, 512, 4)))
-  b = CLBuffer(1024, dtypes.float)
-
-  old = CLProgram("re_S256_16_8", old)
-  new = CLProgram("r_256_16_4_8_4", new)
-
-  old_tms = []
-  new_tms = []
-
-  for i in range(5):
-    old_tms.append(old([1,1,256], [4,16,1], out, x, w, b, wait=True))
-    new_tms.append(new([256,1,1], [4,16,1], out, x, w, b, wait=True))
-
-  print(f"old: {min(old_tms)*1e6:.2f} us new: {min(new_tms)*1e6:.2f} us")
-
-
@@ -1,6 +1,7 @@
 import time
 import numpy as np
-from tinygrad.helpers import dtypes, getenv, prod, flat_mv
+from tinygrad import dtypes
+from tinygrad.helpers import getenv, prod, flat_mv
 from tinygrad.runtime.ops_hip import HIPAllocator, HIPProgram, compile_hip

 # AMD_LOG_LEVEL=3 ./MIOpenDriver gemm --iter 1000 --time 1 --a_w 2048 --a_h 2048 --b_w 2048
@@ -2,8 +2,8 @@ import os
 os.environ["METAL"] = "1"
 import time
 import numpy as np
-from tinygrad.helpers import dtypes, getenv, flat_mv
-from tinygrad import Device
+from tinygrad import Device, dtypes
+from tinygrad.helpers import getenv, flat_mv
 from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal

 N = getenv("N", 2048)
@@ -5,14 +5,14 @@ import time, torch, torch.mps

 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
-from tinygrad import Device, GlobalCounters
+from tinygrad import Device, GlobalCounters, dtypes
 from tinygrad.helpers import colored, getenv, CI, flat_mv

 import os
 os.environ["METAL"] = "1"
 import time
 import numpy as np
-from tinygrad.helpers import dtypes, getenv
+from tinygrad.helpers import getenv
 from tinygrad.runtime.ops_metal import MetalAllocator, MetalDevice, MetalProgram, compile_metal

 N = 16384
@@ -1,7 +1,6 @@
 import numpy as np
 from tinygrad.helpers import getenv
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import dtypes
+from tinygrad import dtypes, Tensor
 dtype_in = dtypes.half if getenv("HALF") else dtypes.float
 N = getenv("N", 4096)
 CNT = getenv("CNT", 10)
@@ -3,9 +3,8 @@ import math
 import os
 import numpy as np
 from pathlib import Path
-from tinygrad import nn
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import dtypes, get_child, fetch
+from tinygrad import nn, Tensor, dtypes
+from tinygrad.helpers import get_child, fetch
 from tinygrad.nn.state import torch_load
 from extra.models.resnet import ResNet
 from extra.models.retinanet import nms as _box_nms
@@ -2,8 +2,8 @@ from __future__ import annotations
 from google.protobuf.internal.containers import RepeatedCompositeFieldContainer
 import importlib
 import numpy as np
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import getenv, DEBUG, dtypes
+from tinygrad import Tensor, dtypes
+from tinygrad.helpers import getenv, DEBUG
 from typing import List, Dict
 from onnx import AttributeProto, ModelProto, TensorProto, TypeProto # onnx 1.50 uses serialized file (see onnx/onnx-ml.proto) as descriptors
 try:
@@ -1,7 +1,8 @@
 import functools, io, math
 from typing import Union, Tuple, Optional, List, Any
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import prod, dtypes, ImageDType, flatten
+from tinygrad import Tensor, dtypes
+from tinygrad.dtype import ImageDType
+from tinygrad.helpers import prod, flatten
 from extra.onnx import safe_numpy
 from onnx.helper import tensor_dtype_to_np_dtype
 from onnx import TensorProto
@@ -12,7 +12,7 @@ from tinygrad.helpers import getenv

 # stuff needed to unpack a kernel
 from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
-from tinygrad.helpers import dtypes
+from tinygrad.dtype import dtypes
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.shape.symbolic import Variable
@@ -5,7 +5,7 @@ import numpy as np

 # stuff needed to unpack a kernel
 from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
-from tinygrad.helpers import dtypes
+from tinygrad.dtype import dtypes
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.shape.symbolic import Variable
@@ -1,6 +1,6 @@
 # stuff needed to unpack a kernel
 from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
-from tinygrad.helpers import dtypes
+from tinygrad.dtype import dtypes
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.shape.symbolic import Variable
@@ -9,7 +9,7 @@ from tinygrad.nn.state import get_parameters, get_state_dict, safe_save, safe_lo

 # stuff needed to unpack a kernel
 from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
-from tinygrad.helpers import dtypes
+from tinygrad.dtype import dtypes
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.shape.symbolic import Variable
@@ -13,8 +13,9 @@ import onnx
 from tqdm import tqdm
 from typing import Tuple, List, Optional, Dict
 from extra.onnx import get_run_onnx
-from tinygrad import Tensor, Device, GlobalCounters
-from tinygrad.helpers import dtypes, partition, Context, fetch, getenv, ImageDType, GRAPH, DEBUG
+from tinygrad import Tensor, Device, GlobalCounters, dtypes
+from tinygrad.dtype import ImageDType
+from tinygrad.helpers import partition, Context, fetch, getenv, GRAPH, DEBUG
 from tinygrad.realize import run_schedule, lower_schedule_item
 from tinygrad.ops import LoadOps, ScheduleItem
 Device.DEFAULT = "GPU"
test/external/external_osx_profiling.py
@@ -1,5 +1,5 @@
 from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
-from tinygrad.helpers import dtypes
+from tinygrad import dtypes
 import time

 N = 1000000
test/external/external_test_jit_on_models.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python
 import unittest
 import numpy as np
-from tinygrad.tensor import Tensor
+from tinygrad import Tensor, dtypes
 from tinygrad.jit import TinyJit
-from tinygrad.helpers import dtypes, CI
+from tinygrad.helpers import CI
 from test.helpers import derandomize_model

 from examples.llama import Transformer
@@ -4,8 +4,8 @@ from tinygrad.tensor import Tensor
 from tinygrad.nn import optim
 from tinygrad.nn.state import get_parameters
 from tinygrad.jit import TinyJit
-from tinygrad import Device, GlobalCounters
-from tinygrad.helpers import CI, dtypes
+from tinygrad import Device, GlobalCounters, dtypes
+from tinygrad.helpers import CI
 from tinygrad.shape.symbolic import Variable
 from test.helpers import derandomize_model

@@ -2,8 +2,7 @@
 import unittest
 import numpy as np
 from tinygrad.tensor import Tensor
-from tinygrad import Device
-from tinygrad.helpers import dtypes
+from tinygrad import Device, dtypes

 N = 200 # has to be bigger than the cache to fail

@@ -4,7 +4,8 @@
 import unittest
 import numpy as np
 from typing import Optional, Tuple
-from tinygrad.helpers import prod, dtypes
+from tinygrad.helpers import prod
+from tinygrad.dtype import dtypes

 # *** first, we implement the atan2 op at the lowest level ***
 # `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
@@ -2,7 +2,8 @@ import unittest
 import numpy as np
 import torch
 import operator
-from tinygrad.helpers import CI, DTYPES_DICT, getenv, DType, DEBUG, ImageDType, PtrDType, OSX, least_upper_float, temp, least_upper_dtype
+from tinygrad.helpers import CI, getenv, DEBUG, OSX, temp
+from tinygrad.dtype import DType, DTYPES_DICT, ImageDType, PtrDType, least_upper_float, least_upper_dtype
 from tinygrad import Device
 from tinygrad.tensor import Tensor, dtypes
 from typing import Any, List
@@ -4,7 +4,8 @@ from tinygrad import Tensor, dtypes, Device
 import operator
 import numpy as np
 from hypothesis import given, strategies as st, settings
-from tinygrad.helpers import CI, getenv, DType, OSX
+from tinygrad.dtype import DType
+from tinygrad.helpers import CI, getenv, OSX
 from tinygrad.ops import UnaryOps, get_lazyop_info

 settings.register_profile("my_profile", max_examples=200, deadline=None)
@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
 from tinygrad import Device, dtypes, Tensor, Variable
-from tinygrad.helpers import ImageDType
+from tinygrad.dtype import ImageDType
 from tinygrad.features.image import to_image_idx

 @unittest.skipIf(Device.DEFAULT != "GPU", "only images on GPU")
@@ -5,7 +5,7 @@ from tinygrad.tensor import Tensor
 # ruff: noqa: F401
 from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
 from tinygrad.lazy import LazyBuffer
-from tinygrad.helpers import dtypes
+from tinygrad import dtypes
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.shape.symbolic import Variable
@@ -12,7 +12,8 @@ from tinygrad.shape.symbolic import MulNode, SumNode, Variable, NumNode, Node, c
 from tinygrad.tensor import Tensor
 from tinygrad.jit import CacheCollector
 from tinygrad.realize import run_schedule
-from tinygrad.helpers import dtypes, prod
+from tinygrad.helpers import prod
+from tinygrad.dtype import dtypes

 @unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "linearizer is only for compiled backends")
 class TestLinearizer(unittest.TestCase):
@@ -2,13 +2,12 @@
 import unittest
 from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.features.search import Opt, OptOps
-from tinygrad import Device
+from tinygrad import Device, dtypes
 from tinygrad.helpers import OSX, CI
 from test.external.fuzz_linearizer import run_linearizer

 # stuff needed to unpack a kernel
 from tinygrad.ops import LazyOp, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer, get_lazyop_info
-from tinygrad.helpers import dtypes
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 inf, nan = float('inf'), float('nan')
@@ -5,8 +5,8 @@ import math
 import numpy as np
 import unittest
 from tinygrad.tensor import Tensor
-from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, dtypes
-from tinygrad import Device
+from tinygrad.helpers import getenv, IMAGE, DEBUG, CI
+from tinygrad import Device, dtypes

 if CI:
   import warnings
@@ -3,9 +3,7 @@ import math
 import unittest
 import numpy as np
 import torch
-from tinygrad.tensor import Tensor
-import tinygrad.nn as nn
-from tinygrad.helpers import dtypes
+from tinygrad import nn, dtypes, Tensor
 from functools import partial

 # https://gist.github.com/devries/11405101
@@ -7,10 +7,10 @@ from typing import List, Optional
 from tinygrad.tensor import Tensor
 from tinygrad.ops import LoadOps
 from tinygrad.device import Device, Compiled
-from tinygrad.helpers import DEBUG, dtypes
+from tinygrad.helpers import DEBUG
 from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.graph import print_tree, realized_lazybuffer
-from tinygrad import nn
+from tinygrad import nn, dtypes

 def check_schedule(t:Tensor, allowed:int, to_prerealize:Optional[List[Tensor]]=None, filter_loadops=True):
   seen = set()
@@ -1,7 +1,7 @@
 import unittest
 from tinygrad.tensor import Tensor
-from tinygrad.helpers import CI, dtypes
-from tinygrad import Device
+from tinygrad.helpers import CI
+from tinygrad import Device, dtypes
 # similar to test/external/external_test_gpu_ast.py, but universal

 @unittest.skipIf(Device.DEFAULT == "CUDA" and CI, "slow on CUDA CI")
@@ -2,8 +2,8 @@ import numpy as np
 import torch
 import unittest, copy
 import mmap
-from tinygrad.tensor import Tensor, Device
-from tinygrad.helpers import dtypes, temp
+from tinygrad import Tensor, Device, dtypes
+from tinygrad.helpers import temp
 from extra.gradcheck import numerical_jacobian, jacobian, gradcheck

 x_init = np.random.randn(1,3).astype(np.float32)
@@ -2,7 +2,8 @@
 from typing import Optional, Tuple, Any, List
 import unittest, math
 import numpy as np
-from tinygrad.helpers import dtypes, getenv, DType, PtrDType
+from tinygrad.dtype import dtypes, DType, PtrDType
+from tinygrad.helpers import getenv
 from tinygrad.device import Buffer, Device
 from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
 from tinygrad.device import CompiledASTRunner, Compiled
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 import unittest
+from tinygrad import dtypes
 from tinygrad.ops import LazyOp, BinaryOps, ReduceOps, get_lazyop_info, BufferOps, MemBuffer
 from tinygrad.shape.shapetracker import ShapeTracker
-from tinygrad.helpers import dtypes

 class TestFlopCounter(unittest.TestCase):
   def setUp(self):
@@ -1,7 +1,8 @@
 import unittest
 import numpy as np
 from PIL import Image
-from tinygrad.helpers import Context, ContextVar, DType, dtypes, merge_dicts, strip_parens, prod, round_up, fetch, fully_flatten
+from tinygrad.dtype import DType, dtypes
+from tinygrad.helpers import Context, ContextVar, merge_dicts, strip_parens, prod, round_up, fetch, fully_flatten
 from tinygrad.shape.symbolic import Variable, NumNode

 VARIABLE = ContextVar("VARIABLE", 0)
@@ -1,6 +1,6 @@
 from tinygrad.tensor import Tensor # noqa: F401
 from tinygrad.jit import TinyJit # noqa: F401
 from tinygrad.shape.symbolic import Variable # noqa: F401
-from tinygrad.helpers import dtypes # noqa: F401
+from tinygrad.dtype import dtypes # noqa: F401
 from tinygrad.ops import GlobalCounters # noqa: F401
 from tinygrad.device import Device # noqa: F401
@@ -3,7 +3,8 @@ import os, math, itertools
 from typing import NamedTuple, Optional, List, Tuple, cast, Dict, Union
 from tinygrad.ops import LazyOp, FlopCounter, get_lazyop_info, UnaryOps, BinaryOps, ReduceOps, MemBuffer, ConstBuffer, BufferOps, vars_from_ast
 from tinygrad.device import Device, Compiled
-from tinygrad.helpers import dedup, dtypes, colored, ImageDType, DType, ansilen, getenv, prod, DEBUG, round_up
+from tinygrad.dtype import dtypes, ImageDType, DType
+from tinygrad.helpers import dedup, colored, ansilen, getenv, prod, DEBUG, round_up
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
 from tinygrad.shape.symbolic import sint
 from tinygrad.shape.view import View, strides_for_shape
@@ -5,7 +5,8 @@ from collections import defaultdict
 from enum import Enum, auto
 from dataclasses import dataclass

-from tinygrad.helpers import colored, ImageDType, DEBUG, dtypes, DType, prod, PtrDType, getenv, all_same, to_function_name, flatten
+from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType
+from tinygrad.helpers import colored, DEBUG, prod, getenv, all_same, to_function_name, flatten
 from tinygrad.ops import LazyOp, UnaryOps, BinaryOps, TernaryOps, ReduceOps, ConstBuffer, MemBuffer, BufferOps, vars_from_ast, get_lazyop_info
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import Variable, NumNode, VariableOrNum, Node, SumNode, MulNode, DivNode, ModNode, LtNode, AndNode
@@ -3,8 +3,8 @@ import numpy as np
 from collections import defaultdict
 from typing import TYPE_CHECKING, Union, Any, List, Optional, Dict, Callable
 import importlib, inspect, functools, pathlib, time, re, ctypes
-from tinygrad.helpers import DType, dtypes, ImageDType, diskcache_get, diskcache_put
-from tinygrad.helpers import ansilen, DEBUG, getenv, colored, BEAM, NOOPT, all_int, to_function_name, from_mv, flat_mv
+from tinygrad.dtype import DType, dtypes, ImageDType
+from tinygrad.helpers import ansilen, DEBUG, getenv, colored, BEAM, NOOPT, all_int, to_function_name, from_mv, flat_mv, diskcache_get, diskcache_put
 from tinygrad.shape.symbolic import Variable, sym_infer, sint
 from tinygrad.ops import LazyOp, TernaryOps, get_lazyop_info, ReduceOps, BufferOps, BinaryOps, UnaryOps, Op, vars_from_ast, GlobalCounters

tinygrad/dtype.py (new file)
@@ -0,0 +1,99 @@
+from typing import NamedTuple, Final, Optional, ClassVar, Set, Tuple, Dict
+import numpy as np # TODO: remove numpy
+import functools
+
+# TODO: migrate this from NamedTuple -> dataclass
+class DType(NamedTuple):
+  priority: int # this determines when things get upcasted
+  itemsize: int
+  name: str
+  np: Optional[type] # TODO: someday this will be removed with the "remove numpy" project
+  sz: int = 1
+  def __repr__(self): return f"dtypes.{INVERSE_DTYPES_DICT[self]}" if self.sz == 1 else f"dtypes._{INVERSE_DTYPES_DICT[self.scalar()]}{self.sz}"
+  def vec(self, sz:int):
+    assert sz > 1 and self.sz == 1, f"can't vectorize {self} with size {sz}"
+    return DType(self.priority, self.itemsize*sz, f"{INVERSE_DTYPES_DICT[self]}{str(sz)}", None, sz)
+  def scalar(self): return DTYPES_DICT[self.name[:-len(str(self.sz))]] if self.sz > 1 else self
+
+# dependent typing?
+class ImageDType(DType):
+  def __new__(cls, priority, itemsize, name, np, shape, base):
+    return super().__new__(cls, priority, itemsize, name, np)
+  def __init__(self, priority, itemsize, name, np, shape, base):
+    self.shape: Tuple[int, ...] = shape # arbitrary arg for the dtype, used in image for the shape
+    self.base: DType = base
+    super().__init__()
+  def scalar(self): return self.base
+  def vec(self, sz:int): return self.base.vec(sz)
+  def __repr__(self): return f"dtypes.{self.name}({self.shape})"
+  # TODO: fix this to not need these
+  def __hash__(self): return hash((super().__hash__(), self.shape))
+  def __eq__(self, x): return super().__eq__(x) and self.shape == x.shape
+  def __ne__(self, x): return super().__ne__(x) or self.shape != x.shape
+
+class PtrDType(DType):
+  def __new__(cls, dt:DType): return super().__new__(cls, dt.priority, dt.itemsize, dt.name, dt.np, dt.sz)
+  def __repr__(self): return f"ptr.{super().__repr__()}"
+
+class dtypes:
+  @staticmethod
+  def is_float(x: DType) -> bool: return x.scalar() in (dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64)
+  @staticmethod # static methds on top, or bool in the type info will refer to dtypes.bool
+  def is_int(x: DType) -> bool: return x.scalar() in (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64) or dtypes.is_unsigned(x)
+  @staticmethod
+  def is_unsigned(x: DType) -> bool: return x.scalar() in (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
+  @staticmethod
+  def from_np(x) -> DType: return DTYPES_DICT[np.dtype(x).name]
+  @staticmethod # NOTE: isinstance(True, int) is True in python
+  def from_py(x) -> DType: return dtypes.default_float if isinstance(x, float) else dtypes.bool if isinstance(x, bool) else dtypes.default_int
+  @staticmethod
+  def fields() -> Dict[str, DType]: return DTYPES_DICT
+  bool: Final[DType] = DType(0, 1, "bool", np.bool_)
+  int8: Final[DType] = DType(1, 1, "char", np.int8)
+  uint8: Final[DType] = DType(2, 1, "unsigned char", np.uint8)
+  int16: Final[DType] = DType(3, 2, "short", np.int16)
+  uint16: Final[DType] = DType(4, 2, "unsigned short", np.uint16)
+  int32: Final[DType] = DType(5, 4, "int", np.int32)
+  uint32: Final[DType] = DType(6, 4, "unsigned int", np.uint32)
+  int64: Final[DType] = DType(7, 8, "long", np.int64)
+  uint64: Final[DType] = DType(8, 8, "unsigned long", np.uint64)
+  float16: Final[DType] = DType(9, 2, "half", np.float16)
+  # bfloat16 has higher priority than float16, so least_upper_dtype(dtypes.int64, dtypes.uint64) = dtypes.float16
+  bfloat16: Final[DType] = DType(10, 2, "__bf16", None)
+  float32: Final[DType] = DType(11, 4, "float", np.float32)
+  float64: Final[DType] = DType(12, 8, "double", np.float64)
+
+  # dtype aliases
+  half = float16; float = float32; double = float64 # noqa: E702
+  uchar = uint8; ushort = uint16; uint = uint32; ulong = uint64 # noqa: E702
+  char = int8; short = int16; int = int32; long = int64 # noqa: E702
+
+  # NOTE: these are image dtypes
+  @staticmethod
+  def imageh(shp): return ImageDType(100, 2, "imageh", np.float16, shp, dtypes.float32)
+  @staticmethod
+  def imagef(shp): return ImageDType(100, 4, "imagef", np.float32, shp, dtypes.float32)
+
+  default_float: ClassVar[DType] = float32
+  default_int: ClassVar[DType] = int32

+# https://jax.readthedocs.io/en/latest/jep/9407-type-promotion.html
+# we don't support weak type and complex type
+promo_lattice = { dtypes.bool: [dtypes.int8, dtypes.uint8],
+  dtypes.int8: [dtypes.int16], dtypes.int16: [dtypes.int32], dtypes.int32: [dtypes.int64], dtypes.int64: [dtypes.float16, dtypes.bfloat16],
+  dtypes.uint8: [dtypes.int16, dtypes.uint16], dtypes.uint16: [dtypes.int32, dtypes.uint32],
+  dtypes.uint32: [dtypes.int64, dtypes.uint64], dtypes.uint64: [dtypes.float16, dtypes.bfloat16],
+  dtypes.float16: [dtypes.float32], dtypes.bfloat16: [dtypes.float32], dtypes.float32: [dtypes.float64], }
+
+@functools.lru_cache(None)
+def _get_recursive_parents(dtype:DType) -> Set[DType]:
+  return set.union(*[_get_recursive_parents(d) for d in promo_lattice[dtype]], {dtype}) if dtype != dtypes.float64 else {dtypes.float64}
+@functools.lru_cache(None)
+def least_upper_dtype(*ds:DType) -> DType:
+  return min(set.intersection(*[_get_recursive_parents(d) for d in ds])) if not (images:=[d for d in ds if isinstance(d, ImageDType)]) else images[0]
+def least_upper_float(dt:DType) -> DType: return dt if dtypes.is_float(dt) else least_upper_dtype(dt, dtypes.float32)
+
+# HACK: staticmethods are not callable in 3.8 so we have to compare the class
+DTYPES_DICT = {k: v for k, v in dtypes.__dict__.items() if (
+  not k.startswith('__') and not k.startswith('default') and not callable(v) and v.__class__ is not staticmethod)}
+INVERSE_DTYPES_DICT = {v:k for k,v in DTYPES_DICT.items()}
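The promotion rules in the new file encode the JAX-style lattice linked in the comment. A worked example of walking it with `least_upper_dtype` (the reprs shown assume the alias table above, since `INVERSE_DTYPES_DICT` keeps the last-registered name for each DType):

```python
from tinygrad.dtype import dtypes, least_upper_dtype, least_upper_float

# int8 and uint8 first meet at int16, the lowest-priority common parent
print(least_upper_dtype(dtypes.int8, dtypes.uint8))     # dtypes.short (int16)
# int64's only parents in the lattice are float types, so this stays float16
print(least_upper_dtype(dtypes.int64, dtypes.float16))  # dtypes.half (float16)
# least_upper_float lifts non-float dtypes to at least float32
print(least_upper_float(dtypes.int32))                  # dtypes.float (float32)
```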
@@ -1,5 +1,6 @@
 from typing import Tuple
-from tinygrad.helpers import prod, IMAGE, getenv, dtypes, DEBUG
+from tinygrad.helpers import prod, IMAGE, getenv, DEBUG
+from tinygrad.dtype import dtypes

 # *** image Tensor function replacements ***

@@ -2,7 +2,8 @@ from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable
 import itertools, random, math, time, multiprocessing, traceback, signal
 from tinygrad.device import Device, Compiled, Buffer
 from tinygrad.ops import MemBuffer, vars_from_ast
-from tinygrad.helpers import prod, ImageDType, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name
+from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name
+from tinygrad.dtype import ImageDType
 from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.shape.symbolic import sym_infer
 from collections import defaultdict
@@ -1,9 +1,8 @@
 from __future__ import annotations
 import os, functools, platform, time, re, contextlib, operator, hashlib, pickle, sqlite3, cProfile, pstats, tempfile, pathlib, string, ctypes
-import numpy as np
-from urllib import request
+from urllib import request # NOTE: this has to be imported specifically
 from tqdm import tqdm
-from typing import Dict, Tuple, Union, List, NamedTuple, Final, ClassVar, Optional, Iterable, Any, TypeVar, TYPE_CHECKING, Callable, Set, Sequence
+from typing import Dict, Tuple, Union, List, ClassVar, Optional, Iterable, Any, TypeVar, TYPE_CHECKING, Callable, Sequence
 if TYPE_CHECKING: # TODO: remove this and import TypeGuard from typing once minimum python supported version is 3.10
   from typing_extensions import TypeGuard

@@ -101,104 +100,6 @@ class Profiling(contextlib.ContextDecorator):
     self.pr.disable()
     pstats.Stats(self.pr).strip_dirs().sort_stats(self.sort).print_stats(self.frac)
 
-# **** tinygrad now supports dtypes! *****
-
-# TODO: migrate this from NamedTuple -> dataclass
-class DType(NamedTuple):
-  priority: int  # this determines when things get upcasted
-  itemsize: int
-  name: str
-  np: Optional[type]  # TODO: someday this will be removed with the "remove numpy" project
-  sz: int = 1
-  def __repr__(self): return f"dtypes.{INVERSE_DTYPES_DICT[self]}" if self.sz == 1 else f"dtypes._{INVERSE_DTYPES_DICT[self.scalar()]}{self.sz}"
-  def vec(self, sz:int):
-    assert sz > 1 and self.sz == 1, f"can't vectorize {self} with size {sz}"
-    return DType(self.priority, self.itemsize*sz, f"{INVERSE_DTYPES_DICT[self]}{str(sz)}", None, sz)
-  def scalar(self): return DTYPES_DICT[self.name[:-len(str(self.sz))]] if self.sz > 1 else self
-
-# dependent typing?
-class ImageDType(DType):
-  def __new__(cls, priority, itemsize, name, np, shape, base):
-    return super().__new__(cls, priority, itemsize, name, np)
-  def __init__(self, priority, itemsize, name, np, shape, base):
-    self.shape: Tuple[int, ...] = shape  # arbitrary arg for the dtype, used in image for the shape
-    self.base: DType = base
-    super().__init__()
-  def scalar(self): return self.base
-  def vec(self, sz:int): return self.base.vec(sz)
-  def __repr__(self): return f"dtypes.{self.name}({self.shape})"
-  # TODO: fix this to not need these
-  def __hash__(self): return hash((super().__hash__(), self.shape))
-  def __eq__(self, x): return super().__eq__(x) and self.shape == x.shape
-  def __ne__(self, x): return super().__ne__(x) or self.shape != x.shape
-
-class PtrDType(DType):
-  def __new__(cls, dt:DType): return super().__new__(cls, dt.priority, dt.itemsize, dt.name, dt.np, dt.sz)
-  def __repr__(self): return f"ptr.{super().__repr__()}"
-
-class dtypes:
-  @staticmethod
-  def is_float(x: DType) -> bool: return x.scalar() in (dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64)
-  @staticmethod # static methds on top, or bool in the type info will refer to dtypes.bool
-  def is_int(x: DType) -> bool: return x.scalar() in (dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64) or dtypes.is_unsigned(x)
-  @staticmethod
-  def is_unsigned(x: DType) -> bool: return x.scalar() in (dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64)
-  @staticmethod
-  def from_np(x) -> DType: return DTYPES_DICT[np.dtype(x).name]
-  @staticmethod # NOTE: isinstance(True, int) is True in python
-  def from_py(x) -> DType: return dtypes.default_float if isinstance(x, float) else dtypes.bool if isinstance(x, bool) else dtypes.default_int
-  @staticmethod
-  def fields() -> Dict[str, DType]: return DTYPES_DICT
-  bool: Final[DType] = DType(0, 1, "bool", np.bool_)
-  int8: Final[DType] = DType(1, 1, "char", np.int8)
-  uint8: Final[DType] = DType(2, 1, "unsigned char", np.uint8)
-  int16: Final[DType] = DType(3, 2, "short", np.int16)
-  uint16: Final[DType] = DType(4, 2, "unsigned short", np.uint16)
-  int32: Final[DType] = DType(5, 4, "int", np.int32)
-  uint32: Final[DType] = DType(6, 4, "unsigned int", np.uint32)
-  int64: Final[DType] = DType(7, 8, "long", np.int64)
-  uint64: Final[DType] = DType(8, 8, "unsigned long", np.uint64)
-  float16: Final[DType] = DType(9, 2, "half", np.float16)
-  # bfloat16 has higher priority than float16, so least_upper_dtype(dtypes.int64, dtypes.uint64) = dtypes.float16
-  bfloat16: Final[DType] = DType(10, 2, "__bf16", None)
-  float32: Final[DType] = DType(11, 4, "float", np.float32)
-  float64: Final[DType] = DType(12, 8, "double", np.float64)
-
-  # dtype aliases
-  half = float16; float = float32; double = float64 # noqa: E702
-  uchar = uint8; ushort = uint16; uint = uint32; ulong = uint64 # noqa: E702
-  char = int8; short = int16; int = int32; long = int64 # noqa: E702
-
-  # NOTE: these are image dtypes
-  @staticmethod
-  def imageh(shp): return ImageDType(100, 2, "imageh", np.float16, shp, dtypes.float32)
-  @staticmethod
-  def imagef(shp): return ImageDType(100, 4, "imagef", np.float32, shp, dtypes.float32)
-
-  default_float: ClassVar[DType] = float32
-  default_int: ClassVar[DType] = int32
-
-# https://jax.readthedocs.io/en/latest/jep/9407-type-promotion.html
-# we don't support weak type and complex type
-promo_lattice = { dtypes.bool: [dtypes.int8, dtypes.uint8],
-  dtypes.int8: [dtypes.int16], dtypes.int16: [dtypes.int32], dtypes.int32: [dtypes.int64], dtypes.int64: [dtypes.float16, dtypes.bfloat16],
-  dtypes.uint8: [dtypes.int16, dtypes.uint16], dtypes.uint16: [dtypes.int32, dtypes.uint32],
-  dtypes.uint32: [dtypes.int64, dtypes.uint64], dtypes.uint64: [dtypes.float16, dtypes.bfloat16],
-  dtypes.float16: [dtypes.float32], dtypes.bfloat16: [dtypes.float32], dtypes.float32: [dtypes.float64], }
-
-@functools.lru_cache(None)
-def _get_recursive_parents(dtype:DType) -> Set[DType]:
-  return set.union(*[_get_recursive_parents(d) for d in promo_lattice[dtype]], {dtype}) if dtype != dtypes.float64 else {dtypes.float64}
-@functools.lru_cache(None)
-def least_upper_dtype(*ds:DType) -> DType:
-  return min(set.intersection(*[_get_recursive_parents(d) for d in ds])) if not (images:=[d for d in ds if isinstance(d, ImageDType)]) else images[0]
-def least_upper_float(dt:DType) -> DType: return dt if dtypes.is_float(dt) else least_upper_dtype(dt, dtypes.float32)
-
-# HACK: staticmethods are not callable in 3.8 so we have to compare the class
-DTYPES_DICT = {k: v for k, v in dtypes.__dict__.items() if (
-  not k.startswith('__') and not k.startswith('default') and not callable(v) and v.__class__ is not staticmethod)}
-INVERSE_DTYPES_DICT = {v:k for k,v in DTYPES_DICT.items()}
-
 # *** universal database cache ***
 
 _cache_dir: str = getenv("XDG_CACHE_HOME", os.path.expanduser("~/Library/Caches" if OSX else "~/.cache"))
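The roughly 100 lines deleted above are re-added nearly verbatim in the new tinygrad/dtype.py, so behavior is unchanged; only the import site moves. A short sketch of the relocated machinery, with assertions that are illustrative rather than part of the diff:

```python
from tinygrad.dtype import dtypes, DTYPES_DICT, INVERSE_DTYPES_DICT

# the lookup tables still round-trip dtype names
assert DTYPES_DICT["float32"] is dtypes.float32
assert INVERSE_DTYPES_DICT[dtypes.bool] == "bool"

# vec()/scalar() remain inverses for vectorized dtypes
v4 = dtypes.float32.vec(4)
assert v4.sz == 4 and v4.itemsize == 4 * dtypes.float32.itemsize
assert v4.scalar() is dtypes.float32

# image dtypes keep their shape and decay to their base dtype
img = dtypes.imageh((8, 8, 4))
assert img.shape == (8, 8, 4) and img.base is dtypes.float32
```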
@@ -1,7 +1,8 @@
 from __future__ import annotations
 from typing import Callable, List, Tuple, Dict, cast, Union, Optional, TypeVar, Generic
 import functools, itertools, operator
-from tinygrad.helpers import DEBUG, DType, merge_dicts, getenv, all_int, Context, GRAPH
+from tinygrad.dtype import DType
+from tinygrad.helpers import DEBUG, merge_dicts, getenv, all_int, Context, GRAPH
 from tinygrad.device import Device, JITRunner, CompiledASTRunner, Buffer
 from tinygrad.tensor import Tensor
 from tinygrad.shape.shapetracker import ShapeTracker
@@ -2,7 +2,8 @@ from __future__ import annotations
 import sys, math
 import numpy as np
 from typing import Union, Optional, Any, Tuple, List, Set, Dict
-from tinygrad.helpers import prod, dtypes, DType, merge_dicts, flatten, getenv, dedup, ImageDType, DEBUG, all_int, all_same
+from tinygrad.dtype import dtypes, DType, ImageDType
+from tinygrad.helpers import prod, merge_dicts, flatten, getenv, dedup, DEBUG, all_int, all_same
 from tinygrad.ops import LoadOps, UnaryOps, BinaryOps, TernaryOps, ReduceOps, BufferOps
 from tinygrad.ops import Op, LazyOp, ConstBuffer, MemBuffer, ScheduleItem, vars_from_ast
 from tinygrad.shape.symbolic import sint, Variable
@@ -1,6 +1,7 @@
 import math
 from typing import Tuple, Optional
-from tinygrad.helpers import argsort, DType
+from tinygrad.helpers import argsort
+from tinygrad.dtype import DType
 from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, ReduceOps
 from tinygrad.tensor import Function
 from tinygrad.lazy import LazyBuffer
@@ -3,7 +3,8 @@ from tqdm import tqdm
 from typing import Dict, Union, List, Optional, Any, Tuple
 from tinygrad.tensor import Tensor
 from tinygrad.ops import GlobalCounters
-from tinygrad.helpers import dtypes, prod, argsort, DEBUG, Timing, CI, unwrap
+from tinygrad.dtype import dtypes
+from tinygrad.helpers import prod, argsort, DEBUG, Timing, CI, unwrap
 from tinygrad.shape.view import strides_for_shape
 
 safe_dtypes = {"F16": dtypes.float16, "F32": dtypes.float32, "U8": dtypes.uint8, "I8": dtypes.int8, "I32": dtypes.int32, "I64": dtypes.int64,
@@ -2,7 +2,8 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Union, Type, Tuple, Any, List, Dict, Callable, ClassVar
 import functools
 from enum import Enum, auto
-from tinygrad.helpers import dtypes, prod, DType, dedup
+from tinygrad.helpers import prod, dedup
+from tinygrad.dtype import dtypes, DType
 from tinygrad.shape.symbolic import Variable
 from dataclasses import dataclass
 
@@ -3,7 +3,8 @@ import math, functools
 from collections import defaultdict, Counter
 from tinygrad.codegen.linearizer import UOps, UOp
 from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
-from tinygrad.helpers import ImageDType, dtypes, prod, DType, PtrDType, strip_parens, getenv
+from tinygrad.helpers import prod, strip_parens, getenv
+from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType
 
 class CStyleLanguage(NamedTuple):
   size_prefix: str = "int"
@@ -1,7 +1,7 @@
 from typing import Final, Dict, Callable, Any, List, Optional, Tuple
 from llvmlite import ir
 from tinygrad.codegen.linearizer import UOps, UOp
-from tinygrad.helpers import DType, PtrDType, dtypes
+from tinygrad.dtype import DType, PtrDType, dtypes
 from tinygrad.ops import Op, UnaryOps, BinaryOps, TernaryOps
 
 MFLAGS = ('nsz', 'arcp', 'contract', 'afn', 'reassoc') # All from fast math, but nnan and ninf
@@ -1,7 +1,8 @@
 from typing import List, Any, Dict, cast, Optional
 import numpy as np
 import Metal
-from tinygrad.helpers import dtypes, dedup, unwrap2
+from tinygrad.dtype import dtypes
+from tinygrad.helpers import dedup, unwrap2
 from tinygrad.device import Buffer, CompiledASTRunner, update_stats
 from tinygrad.jit import JitItem, get_input_replace, get_jit_stats, get_jc_idxs_with_updatable_launch_dims, GraphException
 from tinygrad.shape.symbolic import Variable
@@ -1,6 +1,7 @@
 import os, mmap, _posixshmem
 from typing import Callable, Dict, Tuple
-from tinygrad.helpers import prod, DType, OSX, dtypes
+from tinygrad.dtype import DType, dtypes
+from tinygrad.helpers import prod, OSX
 from tinygrad.device import Interpreted, Allocator
 from tinygrad.ops import Op, MovementOps, UnaryOps
 from tinygrad.shape.view import strides_for_shape
@@ -2,7 +2,8 @@ from __future__ import annotations
 from typing import Tuple, Optional, List
 import ctypes, functools
 import gpuctypes.opencl as cl
-from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, ImageDType, DEBUG
+from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
+from tinygrad.dtype import ImageDType
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import OpenCLRenderer
 from tinygrad.device import Compiled, LRUAllocator
@@ -3,7 +3,8 @@ import numpy as np
 from typing import Dict, Callable
 from tinygrad.ops import BufferOps, UnaryOps, BinaryOps, TernaryOps, ReduceOps, MovementOps, Op
 from tinygrad.device import Interpreted, Allocator
-from tinygrad.helpers import getenv, dtypes, flatten
+from tinygrad.dtype import dtypes
+from tinygrad.helpers import getenv, flatten
 from tinygrad.runtime.ops_cpu import einsum_mulacc, reduce_axis
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu"))
@@ -7,7 +7,7 @@ from functools import partialmethod, reduce
 from itertools import accumulate
 import numpy as np
 
-from tinygrad.helpers import DType, dtypes, ImageDType, least_upper_float, least_upper_dtype
+from tinygrad.dtype import DType, dtypes, ImageDType, least_upper_float, least_upper_dtype
 from tinygrad.helpers import argfix, make_pair, getenv, IMAGE, DEBUG, flatten, prod, all_int, round_up, merge_dicts, fully_flatten
 from tinygrad.lazy import LazyBuffer, create_schedule
 from tinygrad.ops import LoadOps
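tensor.py is the main consumer of the promotion helpers: binary ops upcast their operands through least_upper_dtype. A hedged sketch of the end-to-end effect, assuming this commit's promotion rules:

```python
from tinygrad import Tensor
from tinygrad.dtype import dtypes, least_upper_dtype

a = Tensor([1, 2, 3], dtype=dtypes.int32)
b = Tensor([1.0, 2.0, 3.0], dtype=dtypes.float32)

# int32 + float32 promotes to float32, matching least_upper_dtype
assert least_upper_dtype(a.dtype, b.dtype) == dtypes.float32
assert (a + b).dtype == dtypes.float32
```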