Mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-01-10 07:28:15 -05:00
cleanup stale examples/extra (#13764)
* cleanup stale files
* examples
* move those back
* old
* delete more
@@ -1,93 +0,0 @@
#!/usr/bin/env python3
import os, sys, traceback
sys.path.append(os.getcwd())

from io import StringIO
from contextlib import redirect_stdout
from tinygrad import Tensor, nn
from tinygrad.helpers import Timing, colored, getenv, fetch
from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
from sentencepiece import SentencePieceProcessor

def create_fixed_tokenizer(output_file):
  print("creating fixed tokenizer")
  import extra.junk.sentencepiece_model_pb2 as spb2
  mp = spb2.ModelProto()
  mp.ParseFromString(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/tokenizer.model?download=true").read_bytes())
  mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_end|>", score=0))
  mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_start|>", score=0))
  with open(output_file, "wb") as f:
    f.write(mp.SerializeToString())

# example:
# echo -en "write 2+2\nwrite hello world\ny\n" | TEMP=0 python3 examples/coder.py

if __name__ == "__main__":
  # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json
  with Timing("create model: "):
    model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096, jit=getenv("JIT", 1))

  with Timing("download weights: "):
    part1 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00001-of-00002.bin?download=true"))
    part2 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00002-of-00002.bin?download=true"))

  with Timing("weights -> model: "):
    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part1, 32, 32, 8)), strict=False)
    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part2, 32, 32, 8)), strict=False)

  if not os.path.isfile("/tmp/tokenizer.model"): create_fixed_tokenizer("/tmp/tokenizer.model")
  spp = SentencePieceProcessor(model_file="/tmp/tokenizer.model")

  # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/tokenizer_config.json
  # "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  IM_END = 32000
  IM_START = 32001
  def encode_prompt(k, v): return [IM_START]+spp.encode(f"{k}\n{v}")+[IM_END]+spp.encode("\n")
  def start_prompt(k): return [IM_START]+spp.encode(f"{k}\n")
  def output(outputted, toks, color):
    cur = spp.decode(toks)[len(outputted):]
    sys.stdout.write(colored(cur, color))
    sys.stdout.flush()
    outputted += cur
    return outputted

  # *** app below this line ***

  toks = [spp.bos_id()] + encode_prompt("system", "You are Quentin. Quentin is a useful assistant who writes Python code to answer questions. He keeps the code as short as possible and doesn't read from user input")

  PROMPT = getenv("PROMPT", 1)
  temperature = getenv("TEMP", 0.7)

  start_pos = 0
  outputted = output("", toks, "green")
  turn = True
  while 1:
    if PROMPT:
      toks += encode_prompt("user", input("Q: ")) + start_prompt("assistant")
    else:
      toks += start_prompt("user" if turn else "assistant")
      turn = not turn
    old_output_len = len(outputted)
    while 1:
      tok = model(Tensor([toks[start_pos:]]), start_pos, temperature).item()
      start_pos = len(toks)
      toks.append(tok)
      outputted = output(outputted, toks, "blue" if not turn else "cyan")
      if tok == IM_END: break
      if tok == spp.eos_id(): break
    new_output = outputted[old_output_len:]

    if new_output.endswith("```") and '```python\n' in new_output:
      python_code = new_output.split('```python\n')[1].split("```")[0]
      # AI safety. Warning to user. Do not press y if the AI is trying to do unsafe things.
      if input(colored(f" <-- PYTHON DETECTED, RUN IT? ", "red")).lower() == 'y':
        my_stdout = StringIO()
        try:
          with redirect_stdout(my_stdout): exec(python_code)
          result = my_stdout.getvalue()
        except Exception as e:
          result = ''.join(traceback.format_exception_only(e))
        toks += spp.encode(f"\nOutput:\n```\n{result}```")
        outputted = output(outputted, toks, "yellow")
        old_output_len = len(outputted)
    print("")
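For reference, a minimal sketch of the ChatML framing this script assembles, using plain strings in place of SentencePiece ids (the messages below are hypothetical; the template string itself is quoted verbatim in the comment above):

def chatml(messages, add_generation_prompt=True):
  # mirrors the chat_template above: <|im_start|>role\ncontent<|im_end|>\n per message
  s = "".join(f"<|im_start|>{role}\n{content}<|im_end|>\n" for role, content in messages)
  return s + ("<|im_start|>assistant\n" if add_generation_prompt else "")

print(chatml([("system", "You are Quentin."), ("user", "write 2+2")]))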
@@ -1,89 +0,0 @@
# load weights from
# https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth
# a rough copy of
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
import sys
import ast
import time
import numpy as np
from PIL import Image
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, fetch, Timing
from tinygrad.engine.jit import TinyJit
from extra.models.efficientnet import EfficientNet
np.set_printoptions(suppress=True)

# TODO: you should be able to put these in the jitted function
bias = Tensor([0.485, 0.456, 0.406])
scale = Tensor([0.229, 0.224, 0.225])

@TinyJit
def _infer(model, img):
  img = img.permute((2,0,1))
  img = img / 255.0
  img = img - bias.reshape((1,-1,1,1))
  img = img / scale.reshape((1,-1,1,1))
  return model.forward(img).realize()

def infer(model, img):
  # preprocess image
  aspect_ratio = img.size[0] / img.size[1]
  img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))

  img = np.array(img)
  y0,x0=(np.asarray(img.shape)[:2]-224)//2
  retimg = img = img[y0:y0+224, x0:x0+224]

  # if you want to look at the image
  """
  import matplotlib.pyplot as plt
  plt.imshow(img)
  plt.show()
  """

  # run the net
  out = _infer(model, Tensor(img.astype("float32"))).numpy()

  # if you want to look at the outputs
  """
  import matplotlib.pyplot as plt
  plt.plot(out[0])
  plt.show()
  """
  return out, retimg

if __name__ == "__main__":
  # instantiate my net
  model = EfficientNet(getenv("NUM", 0))
  model.load_from_pretrained()

  # category labels
  lbls = ast.literal_eval(fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt").read_text())

  # load image and preprocess
  url = sys.argv[1] if len(sys.argv) >= 2 else "https://raw.githubusercontent.com/tinygrad/tinygrad/master/docs/showcase/stable_diffusion_by_tinygrad.jpg"
  if url == 'webcam':
    import cv2
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
    while 1:
      _ = cap.grab() # discard one frame to circumvent capture buffering
      ret, frame = cap.read()
      img = Image.fromarray(frame[:, :, [2,1,0]])
      lt = time.monotonic_ns()
      out, retimg = infer(model, img)
      print(f"{(time.monotonic_ns()-lt)*1e-6:7.2f} ms", np.argmax(out), np.max(out), lbls[np.argmax(out)])
      SCALE = 3
      simg = cv2.resize(retimg, (224*SCALE, 224*SCALE))
      retimg = cv2.cvtColor(simg, cv2.COLOR_RGB2BGR)
      cv2.imshow('capture', retimg)
      if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    cap.release()
    cv2.destroyAllWindows()
  else:
    img = Image.open(fetch(url))
    for i in range(getenv("CNT", 1)):
      with Timing("did inference in "):
        out, _ = infer(model, img)
    print(np.argmax(out), np.max(out), lbls[np.argmax(out)])
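The resize-then-center-crop in infer() as one runnable sketch (hypothetical 640x480 input): the short side is scaled to 224 preserving aspect ratio, then a centered 224x224 patch is cut out.

import numpy as np
h, w = 480, 640                                              # hypothetical input size
ar = w / h
rw, rh = int(224*max(ar, 1.0)), int(224*max(1.0/ar, 1.0))    # resized to 298 x 224
y0, x0 = (np.array([rh, rw]) - 224) // 2                     # center-crop offsets
print((rw, rh), (y0, x0))                                    # (298, 224) (0, 37)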
@@ -1,498 +0,0 @@
# pip3 install sentencepiece

# This file incorporates code from the following:
# Github Name | License | Link
# black-forest-labs/flux | Apache | https://github.com/black-forest-labs/flux/tree/main/model_licenses

from tinygrad import Tensor, nn, dtypes, TinyJit
from tinygrad.nn.state import safe_load, load_state_dict
from tinygrad.helpers import fetch, tqdm, colored
from sdxl import FirstStage
from extra.models.clip import FrozenClosedClipEmbedder
from extra.models.t5 import T5Embedder
import numpy as np

import math, time, argparse, tempfile
from typing import List, Dict, Optional, Union, Tuple, Callable
from dataclasses import dataclass
from pathlib import Path
from PIL import Image

urls:dict = {
  "flux-schnell": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/flux1-schnell.safetensors",
  "flux-dev": "https://huggingface.co/camenduru/FLUX.1-dev/resolve/main/flux1-dev.sft",
  "ae": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/ae.safetensors",
  "T5_1_of_2": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder_2/model-00001-of-00002.safetensors",
  "T5_2_of_2": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder_2/model-00002-of-00002.safetensors",
  "T5_tokenizer": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/tokenizer_2/spiece.model",
  "clip": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder/model.safetensors"
}

def tensor_identity(x:Tensor) -> Tensor: return x

class AutoEncoder:
  def __init__(self, scale_factor:float, shift_factor:float):
    self.decoder = FirstStage.Decoder(128, 3, 3, 16, [1, 2, 4, 4], 2, 256)
    self.scale_factor = scale_factor
    self.shift_factor = shift_factor

  def decode(self, z:Tensor) -> Tensor:
    z = z / self.scale_factor + self.shift_factor
    return self.decoder(z)

# Conditioner
class ClipEmbedder(FrozenClosedClipEmbedder):
  def __call__(self, texts:Union[str, List[str], Tensor]) -> Tensor:
    if isinstance(texts, str): texts = [texts]
    assert isinstance(texts, (list,tuple)), f"expected list of strings, got {type(texts).__name__}"
    tokens = Tensor.cat(*[Tensor(self.tokenizer.encode(text)) for text in texts], dim=0)
    return self.transformer.text_model(tokens.reshape(len(texts),-1))[:, tokens.argmax(-1)]

# https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
def attention(q:Tensor, k:Tensor, v:Tensor, pe:Tensor) -> Tensor:
  q, k = apply_rope(q, k, pe)
  x = Tensor.scaled_dot_product_attention(q, k, v)
  return x.rearrange("B H L D -> B L (H D)")

def rope(pos:Tensor, dim:int, theta:int) -> Tensor:
  assert dim % 2 == 0
  scale = Tensor.arange(0, dim, 2, dtype=dtypes.float32, device=pos.device) / dim # NOTE: this is torch.float64 in reference implementation
  omega = 1.0 / (theta**scale)
  out = Tensor.einsum("...n,d->...nd", pos, omega)
  out = Tensor.stack(Tensor.cos(out), -Tensor.sin(out), Tensor.sin(out), Tensor.cos(out), dim=-1)
  out = out.rearrange("b n d (i j) -> b n d i j", i=2, j=2)
  return out.float()

def apply_rope(xq:Tensor, xk:Tensor, freqs_cis:Tensor) -> Tuple[Tensor, Tensor]:
  xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
  xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
  xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
  xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
  return xq_out.reshape(*xq.shape).cast(xq.dtype), xk_out.reshape(*xk.shape).cast(xk.dtype)


# https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
class EmbedND:
  def __init__(self, dim:int, theta:int, axes_dim:List[int]):
    self.dim = dim
    self.theta = theta
    self.axes_dim = axes_dim

  def __call__(self, ids:Tensor) -> Tensor:
    n_axes = ids.shape[-1]
    emb = Tensor.cat(*[rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], dim=-3)
    return emb.unsqueeze(1)

class MLPEmbedder:
  def __init__(self, in_dim:int, hidden_dim:int):
    self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
    self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)

  def __call__(self, x:Tensor) -> Tensor:
    return self.out_layer(self.in_layer(x).silu())

class QKNorm:
  def __init__(self, dim:int):
    self.query_norm = nn.RMSNorm(dim)
    self.key_norm = nn.RMSNorm(dim)

  def __call__(self, q:Tensor, k:Tensor) -> Tuple[Tensor, Tensor]:
    return self.query_norm(q), self.key_norm(k)

class SelfAttention:
  def __init__(self, dim:int, num_heads:int = 8, qkv_bias:bool = False):
    self.num_heads = num_heads
    head_dim = dim // num_heads

    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.norm = QKNorm(head_dim)
    self.proj = nn.Linear(dim, dim)

  def __call__(self, x:Tensor, pe:Tensor) -> Tensor:
    qkv = self.qkv(x)
    q, k, v = qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
    q, k = self.norm(q, k)
    x = attention(q, k, v, pe=pe)
    return self.proj(x)

@dataclass
class ModulationOut:
  shift:Tensor
  scale:Tensor
  gate:Tensor

class Modulation:
  def __init__(self, dim:int, double:bool):
    self.is_double = double
    self.multiplier = 6 if double else 3
    self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)

  def __call__(self, vec:Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]:
    out = self.lin(vec.silu())[:, None, :].chunk(self.multiplier, dim=-1)
    return ModulationOut(*out[:3]), ModulationOut(*out[3:]) if self.is_double else None

class DoubleStreamBlock:
  def __init__(self, hidden_size:int, num_heads:int, mlp_ratio:float, qkv_bias:bool = False):
    mlp_hidden_dim = int(hidden_size * mlp_ratio)
    self.num_heads = num_heads
    self.hidden_size = hidden_size
    self.img_mod = Modulation(hidden_size, double=True)
    self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)

    self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.img_mlp = [nn.Linear(hidden_size, mlp_hidden_dim, bias=True), Tensor.gelu, nn.Linear(mlp_hidden_dim, hidden_size, bias=True)]

    self.txt_mod = Modulation(hidden_size, double=True)
    self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)

    self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.txt_mlp = [nn.Linear(hidden_size, mlp_hidden_dim, bias=True), Tensor.gelu, nn.Linear(mlp_hidden_dim, hidden_size, bias=True)]

  def __call__(self, img:Tensor, txt:Tensor, vec:Tensor, pe:Tensor) -> tuple[Tensor, Tensor]:
    img_mod1, img_mod2 = self.img_mod(vec)
    txt_mod1, txt_mod2 = self.txt_mod(vec)
    assert img_mod2 is not None and txt_mod2 is not None
    # prepare image for attention
    img_modulated = self.img_norm1(img)
    img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
    img_qkv = self.img_attn.qkv(img_modulated)
    img_q, img_k, img_v = img_qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
    img_q, img_k = self.img_attn.norm(img_q, img_k)

    # prepare txt for attention
    txt_modulated = self.txt_norm1(txt)
    txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
    txt_qkv = self.txt_attn.qkv(txt_modulated)
    txt_q, txt_k, txt_v = txt_qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
    txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k)

    # run actual attention
    q = Tensor.cat(txt_q, img_q, dim=2)
    k = Tensor.cat(txt_k, img_k, dim=2)
    v = Tensor.cat(txt_v, img_v, dim=2)

    attn = attention(q, k, v, pe=pe)
    txt_attn, img_attn = attn[:, :txt.shape[1]], attn[:, txt.shape[1]:]

    # calculate the img blocks
    img = img + img_mod1.gate * self.img_attn.proj(img_attn)
    img = img + img_mod2.gate * ((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift).sequential(self.img_mlp)

    # calculate the txt blocks
    txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
    txt = txt + txt_mod2.gate * ((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift).sequential(self.txt_mlp)
    return img, txt


class SingleStreamBlock:
  """
  A DiT block with parallel linear layers as described in
  https://arxiv.org/abs/2302.05442 and adapted modulation interface.
  """

  def __init__(self, hidden_size:int, num_heads:int, mlp_ratio:float=4.0, qk_scale:Optional[float]=None):
    self.hidden_dim = hidden_size
    self.num_heads = num_heads
    head_dim = hidden_size // num_heads
    self.scale = qk_scale or head_dim**-0.5

    self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
    # qkv and mlp_in
    self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
    # proj and mlp_out
    self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)

    self.norm = QKNorm(head_dim)

    self.hidden_size = hidden_size
    self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

    self.mlp_act = Tensor.gelu
    self.modulation = Modulation(hidden_size, double=False)

  def __call__(self, x:Tensor, vec:Tensor, pe:Tensor) -> Tensor:
    mod, _ = self.modulation(vec)
    x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
    qkv, mlp = Tensor.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
    q, k, v = qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
    q, k = self.norm(q, k)

    # compute attention
    attn = attention(q, k, v, pe=pe)
    # compute activation in mlp stream, cat again and run second linear layer
    output = self.linear2(Tensor.cat(attn, self.mlp_act(mlp), dim=2))
    return x + mod.gate * output


class LastLayer:
  def __init__(self, hidden_size:int, patch_size:int, out_channels:int):
    self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
    self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
    self.adaLN_modulation:List[Callable[[Tensor], Tensor]] = [Tensor.silu, nn.Linear(hidden_size, 2 * hidden_size, bias=True)]

  def __call__(self, x:Tensor, vec:Tensor) -> Tensor:
    shift, scale = vec.sequential(self.adaLN_modulation).chunk(2, dim=1)
    x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
    return self.linear(x)

def timestep_embedding(t:Tensor, dim:int, max_period:int=10000, time_factor:float=1000.0) -> Tensor:
  """
  Create sinusoidal timestep embeddings.
  :param t: a 1-D Tensor of N indices, one per batch element.
            These may be fractional.
  :param dim: the dimension of the output.
  :param max_period: controls the minimum frequency of the embeddings.
  :return: an (N, D) Tensor of positional embeddings.
  """
  t = time_factor * t
  half = dim // 2
  freqs = Tensor.exp(-math.log(max_period) * Tensor.arange(0, stop=half, dtype=dtypes.float32) / half).to(t.device)

  args = t[:, None].float() * freqs[None]
  embedding = Tensor.cat(Tensor.cos(args), Tensor.sin(args), dim=-1)
  if dim % 2: embedding = Tensor.cat(*[embedding, Tensor.zeros_like(embedding[:, :1])], dim=-1)
  if Tensor.is_floating_point(t): embedding = embedding.cast(t.dtype)
  return embedding

# https://github.com/black-forest-labs/flux/blob/main/src/flux/model.py
class Flux:
  """
  Transformer model for flow matching on sequences.
  """

  def __init__(
    self,
    guidance_embed:bool,
    in_channels:int = 64,
    vec_in_dim:int = 768,
    context_in_dim:int = 4096,
    hidden_size:int = 3072,
    mlp_ratio:float = 4.0,
    num_heads:int = 24,
    depth:int = 19,
    depth_single_blocks:int = 38,
    axes_dim:Optional[List[int]] = None,
    theta:int = 10_000,
    qkv_bias:bool = True,
  ):

    axes_dim = axes_dim or [16, 56, 56]
    self.guidance_embed = guidance_embed
    self.in_channels = in_channels
    self.out_channels = self.in_channels
    if hidden_size % num_heads != 0:
      raise ValueError(f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}")
    pe_dim = hidden_size // num_heads
    if sum(axes_dim) != pe_dim:
      raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
    self.hidden_size = hidden_size
    self.num_heads = num_heads
    self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim)
    self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
    self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
    self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
    self.guidance_in:Callable[[Tensor], Tensor] = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else tensor_identity
    self.txt_in = nn.Linear(context_in_dim, self.hidden_size)

    self.double_blocks = [DoubleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias) for _ in range(depth)]
    self.single_blocks = [SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio) for _ in range(depth_single_blocks)]
    self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)

  def __call__(self, img:Tensor, img_ids:Tensor, txt:Tensor, txt_ids:Tensor, timesteps:Tensor, y:Tensor, guidance:Optional[Tensor] = None) -> Tensor:
    if img.ndim != 3 or txt.ndim != 3:
      raise ValueError("Input img and txt tensors must have 3 dimensions.")
    # running on sequences img
    img = self.img_in(img)
    vec = self.time_in(timestep_embedding(timesteps, 256))
    if self.guidance_embed:
      if guidance is None:
        raise ValueError("Didn't get guidance strength for guidance distilled model.")
      vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
    vec = vec + self.vector_in(y)
    txt = self.txt_in(txt)
    ids = Tensor.cat(txt_ids, img_ids, dim=1)
    pe = self.pe_embedder(ids)
    for double_block in self.double_blocks:
      img, txt = double_block(img=img, txt=txt, vec=vec, pe=pe)

    img = Tensor.cat(txt, img, dim=1)
    for single_block in self.single_blocks:
      img = single_block(img, vec=vec, pe=pe)

    img = img[:, txt.shape[1]:, ...]

    return self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)

# https://github.com/black-forest-labs/flux/blob/main/src/flux/util.py
def load_flow_model(name:str, model_path:str):
  # Loading Flux
  print("Init model")
  model = Flux(guidance_embed=(name != "flux-schnell"))
  if not model_path: model_path = fetch(urls[name])
  state_dict = {k.replace("scale", "weight"): v for k, v in safe_load(model_path).items()}
  load_state_dict(model, state_dict)
  return model

def load_T5(max_length:int=512):
  # max length 64, 128, 256 and 512 should work (if your sequence is short enough)
  print("Init T5")
  T5 = T5Embedder(max_length, fetch(urls["T5_tokenizer"]))
  pt_1 = fetch(urls["T5_1_of_2"])
  pt_2 = fetch(urls["T5_2_of_2"])
  load_state_dict(T5.encoder, safe_load(pt_1) | safe_load(pt_2), strict=False)
  return T5

def load_clip():
  print("Init Clip")
  clip = ClipEmbedder()
  load_state_dict(clip.transformer, safe_load(fetch(urls["clip"])))
  return clip

def load_ae() -> AutoEncoder:
  # Loading the autoencoder
  print("Init AE")
  ae = AutoEncoder(0.3611, 0.1159)
  load_state_dict(ae, safe_load(fetch(urls["ae"])))
  return ae

# https://github.com/black-forest-labs/flux/blob/main/src/flux/sampling.py
def prepare(T5:T5Embedder, clip:ClipEmbedder, img:Tensor, prompt:Union[str, List[str]]) -> Dict[str, Tensor]:
  bs, _, h, w = img.shape
  if bs == 1 and not isinstance(prompt, str):
    bs = len(prompt)

  img = img.rearrange("b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
  if img.shape[0] == 1 and bs > 1:
    img = img.expand((bs, *img.shape[1:]))

  img_ids = Tensor.zeros(h // 2, w // 2, 3).contiguous()
  img_ids[..., 1] = img_ids[..., 1] + Tensor.arange(h // 2)[:, None]
  img_ids[..., 2] = img_ids[..., 2] + Tensor.arange(w // 2)[None, :]
  img_ids = img_ids.rearrange("h w c -> 1 (h w) c")
  img_ids = img_ids.expand((bs, *img_ids.shape[1:]))

  if isinstance(prompt, str):
    prompt = [prompt]
  txt = T5(prompt).realize()
  if txt.shape[0] == 1 and bs > 1:
    txt = txt.expand((bs, *txt.shape[1:]))
  txt_ids = Tensor.zeros(bs, txt.shape[1], 3)

  vec = clip(prompt).realize()
  if vec.shape[0] == 1 and bs > 1:
    vec = vec.expand((bs, *vec.shape[1:]))

  return {"img": img, "img_ids": img_ids.to(img.device), "txt": txt.to(img.device), "txt_ids": txt_ids.to(img.device), "vec": vec.to(img.device)}


def get_schedule(num_steps:int, image_seq_len:int, base_shift:float=0.5, max_shift:float=1.15, shift:bool=True) -> List[float]:
  # extra step for zero
  step_size = -1.0 / num_steps
  timesteps = Tensor.arange(1, 0 + step_size, step_size)

  # shifting the schedule to favor high timesteps for higher signal images
  if shift:
    # estimate mu based on linear estimation between two points
    mu = 0.5 + (max_shift - base_shift) * (image_seq_len - 256) / (4096 - 256)
    timesteps = math.exp(mu) / (math.exp(mu) + (1 / timesteps - 1))
  return timesteps.tolist()

@TinyJit
def run(model, *args): return model(*args).realize()

def denoise(model, img:Tensor, img_ids:Tensor, txt:Tensor, txt_ids:Tensor, vec:Tensor, timesteps:List[float], guidance:float=4.0) -> Tensor:
  # this is ignored for schnell
  guidance_vec = Tensor((guidance,), device=img.device, dtype=img.dtype).expand((img.shape[0],))
  for t_curr, t_prev in tqdm(list(zip(timesteps[:-1], timesteps[1:])), "Denoising"):
    t_vec = Tensor((t_curr,), device=img.device, dtype=img.dtype).expand((img.shape[0],))
    pred = run(model, img, img_ids, txt, txt_ids, t_vec, vec, guidance_vec)
    img = img + (t_prev - t_curr) * pred

  return img

def unpack(x:Tensor, height:int, width:int) -> Tensor:
  return x.rearrange("b (h w) (c ph pw) -> b c (h ph) (w pw)", h=math.ceil(height / 16), w=math.ceil(width / 16), ph=2, pw=2)

# https://github.com/black-forest-labs/flux/blob/main/src/flux/cli.py
if __name__ == "__main__":
  default_prompt = "bananas and a can of coke"
  parser = argparse.ArgumentParser(description="Run Flux.1", formatter_class=argparse.ArgumentDefaultsHelpFormatter)

  parser.add_argument("--name", type=str, default="flux-schnell", help="Name of the model to load")
  parser.add_argument("--model_path", type=str, default="", help="path of the model file")
  parser.add_argument("--width", type=int, default=512, help="width of the sample in pixels (should be a multiple of 16)")
  parser.add_argument("--height", type=int, default=512, help="height of the sample in pixels (should be a multiple of 16)")
  parser.add_argument("--seed", type=int, default=None, help="Set a seed for sampling")
  parser.add_argument("--prompt", type=str, default=default_prompt, help="Prompt used for sampling")
  parser.add_argument('--out', type=str, default=Path(tempfile.gettempdir()) / "rendered.png", help="Output filename")
  parser.add_argument("--num_steps", type=int, default=None, help="number of sampling steps (default 4 for schnell, 50 for guidance distilled)") #noqa:E501
  parser.add_argument("--guidance", type=float, default=3.5, help="guidance value used for guidance distillation")
  parser.add_argument("--output_dir", type=str, default="output", help="output directory")
  args = parser.parse_args()

  if args.name not in ["flux-schnell", "flux-dev"]:
    raise ValueError(f"Got unknown model name: {args.name}, choose from flux-schnell and flux-dev")

  if args.num_steps is None:
    args.num_steps = 4 if args.name == "flux-schnell" else 50

  # allow for packing and conversion to latent space
  height = 16 * (args.height // 16)
  width = 16 * (args.width // 16)

  if args.seed is None: args.seed = Tensor._seed
  else: Tensor.manual_seed(args.seed)

  print(f"Generating with seed {args.seed}:\n{args.prompt}")
  t0 = time.perf_counter()

  # prepare input noise
  x = Tensor.randn(1, 16, 2 * math.ceil(height / 16), 2 * math.ceil(width / 16), dtype="bfloat16")

  # load text embedders
  T5 = load_T5(max_length=256 if args.name == "flux-schnell" else 512)
  clip = load_clip()

  # embed text to get inputs for model
  inp = prepare(T5, clip, x, prompt=args.prompt)
  timesteps = get_schedule(args.num_steps, inp["img"].shape[1], shift=(args.name != "flux-schnell"))

  # done with text embedders
  del T5, clip

  # load model
  model = load_flow_model(args.name, args.model_path)

  # denoise initial noise
  x = denoise(model, **inp, timesteps=timesteps, guidance=args.guidance)

  # done with model
  del model, run

  # load autoencoder
  ae = load_ae()

  # decode latents to pixel space
  x = unpack(x.float(), height, width)
  x = ae.decode(x).realize()

  t1 = time.perf_counter()
  print(f"Done in {t1 - t0:.1f}s. Saving {args.out}")

  # bring into PIL format and save
  x = x.clamp(-1, 1)
  x = x[0].rearrange("c h w -> h w c")
  x = (127.5 * (x + 1.0)).cast("uint8")

  img = Image.fromarray(x.numpy())

  img.save(args.out)

  # validation!
  if args.prompt == default_prompt and args.name=="flux-schnell" and args.seed == 0 and args.width == args.height == 512:
    ref_image = Tensor(np.array(Image.open("examples/flux1_seed0.png")))
    distance = (((x.cast(dtypes.float) - ref_image.cast(dtypes.float)) / ref_image.max())**2).mean().item()
    assert distance < 4e-3, colored(f"validation failed with {distance=}", "red")
    print(colored(f"output validated with {distance=}", "green"))
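A quick numeric check of get_schedule's shift (a plain-float rewrite of the formula above; for a 512x512 image the packed latent sequence length is (512/16)**2 = 1024):

import math
image_seq_len, base_shift, max_shift = 1024, 0.5, 1.15
mu = 0.5 + (max_shift - base_shift) * (image_seq_len - 256) / (4096 - 256)
t = 0.5                                                  # an unshifted timestep
print(mu, math.exp(mu) / (math.exp(mu) + (1 / t - 1)))   # mu = 0.63, t is pulled up to ~0.65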
@@ -1,299 +0,0 @@
from extra.models.mask_rcnn import MaskRCNN
from extra.models.resnet import ResNet
from extra.models.mask_rcnn import BoxList
from torch.nn import functional as F
from torchvision import transforms as T
from torchvision.transforms import functional as Ft
import random
from tinygrad.tensor import Tensor
from PIL import Image
import numpy as np
import torch
import argparse
import cv2


class Resize:
  def __init__(self, min_size, max_size):
    if not isinstance(min_size, (list, tuple)):
      min_size = (min_size,)
    self.min_size = min_size
    self.max_size = max_size

  # modified from torchvision to add support for max size
  def get_size(self, image_size):
    w, h = image_size
    size = random.choice(self.min_size)
    max_size = self.max_size
    if max_size is not None:
      min_original_size = float(min((w, h)))
      max_original_size = float(max((w, h)))
      if max_original_size / min_original_size * size > max_size:
        size = int(round(max_size * min_original_size / max_original_size))

    if (w <= h and w == size) or (h <= w and h == size):
      return (h, w)

    if w < h:
      ow = size
      oh = int(size * h / w)
    else:
      oh = size
      ow = int(size * w / h)

    return (oh, ow)

  def __call__(self, image):
    size = self.get_size(image.size)
    image = Ft.resize(image, size)
    return image


class Normalize:
  def __init__(self, mean, std, to_bgr255=True):
    self.mean = mean
    self.std = std
    self.to_bgr255 = to_bgr255

  def __call__(self, image):
    if self.to_bgr255:
      image = image[[2, 1, 0]] * 255
    else:
      image = image[[0, 1, 2]] * 255
    image = Ft.normalize(image, mean=self.mean, std=self.std)
    return image

transforms = lambda size_scale: T.Compose(
  [
    Resize(int(800*size_scale), int(1333*size_scale)),
    T.ToTensor(),
    Normalize(
      mean=[102.9801, 115.9465, 122.7717], std=[1., 1., 1.], to_bgr255=True
    ),
  ]
)

def expand_boxes(boxes, scale):
  w_half = (boxes[:, 2] - boxes[:, 0]) * .5
  h_half = (boxes[:, 3] - boxes[:, 1]) * .5
  x_c = (boxes[:, 2] + boxes[:, 0]) * .5
  y_c = (boxes[:, 3] + boxes[:, 1]) * .5

  w_half *= scale
  h_half *= scale

  boxes_exp = torch.zeros_like(boxes)
  boxes_exp[:, 0] = x_c - w_half
  boxes_exp[:, 2] = x_c + w_half
  boxes_exp[:, 1] = y_c - h_half
  boxes_exp[:, 3] = y_c + h_half
  return boxes_exp


def expand_masks(mask, padding):
  N = mask.shape[0]
  M = mask.shape[-1]
  pad2 = 2 * padding
  scale = float(M + pad2) / M
  padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2))
  padded_mask[:, :, padding:-padding, padding:-padding] = mask
  return padded_mask, scale


def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1):
  # TODO: remove torch
  mask = torch.tensor(mask.numpy())
  box = torch.tensor(box.numpy())
  padded_mask, scale = expand_masks(mask[None], padding=padding)
  mask = padded_mask[0, 0]
  box = expand_boxes(box[None], scale)[0]
  box = box.to(dtype=torch.int32)

  TO_REMOVE = 1
  w = int(box[2] - box[0] + TO_REMOVE)
  h = int(box[3] - box[1] + TO_REMOVE)
  w = max(w, 1)
  h = max(h, 1)

  mask = mask.expand((1, 1, -1, -1))

  mask = mask.to(torch.float32)
  mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False)
  mask = mask[0][0]

  if thresh >= 0:
    mask = mask > thresh
  else:
    mask = (mask * 255).to(torch.uint8)

  im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8)
  x_0 = max(box[0], 0)
  x_1 = min(box[2] + 1, im_w)
  y_0 = max(box[1], 0)
  y_1 = min(box[3] + 1, im_h)

  im_mask[y_0:y_1, x_0:x_1] = mask[
    (y_0 - box[1]): (y_1 - box[1]), (x_0 - box[0]): (x_1 - box[0])
  ]
  return im_mask


class Masker:
  def __init__(self, threshold=0.5, padding=1):
    self.threshold = threshold
    self.padding = padding

  def forward_single_image(self, masks, boxes):
    boxes = boxes.convert("xyxy")
    im_w, im_h = boxes.size
    res = [
      paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding)
      for mask, box in zip(masks, boxes.bbox)
    ]
    if len(res) > 0:
      # torch.stack takes a sequence of tensors; unpacking the list here was a bug
      res = torch.stack(res, dim=0)[:, None]
    else:
      res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1]))
    return Tensor(res.numpy())

  def __call__(self, masks, boxes):
    if isinstance(boxes, BoxList):
      boxes = [boxes]

    results = []
    for mask, box in zip(masks, boxes):
      result = self.forward_single_image(mask, box)
      results.append(result)
    return results


masker = Masker(threshold=0.5, padding=1)

def select_top_predictions(predictions, confidence_threshold=0.9):
  scores = predictions.get_field("scores").numpy()
  keep = [idx for idx, score in enumerate(scores) if score > confidence_threshold]
  return predictions[keep]

def compute_prediction(original_image, model, confidence_threshold, size_scale=1.0):
  image = transforms(size_scale)(original_image).numpy()
  image = Tensor(image, requires_grad=False)
  predictions = model(image)
  prediction = predictions[0]
  prediction = select_top_predictions(prediction, confidence_threshold)
  width, height = original_image.size
  prediction = prediction.resize((width, height))

  if prediction.has_field("mask"):
    masks = prediction.get_field("mask")
    masks = masker([masks], [prediction])[0]
    prediction.add_field("mask", masks)
  return prediction

def compute_prediction_batched(batch, model, size_scale=1.0):
  imgs = []
  for img in batch:
    imgs.append(transforms(size_scale)(img).numpy())
  image = [Tensor(image, requires_grad=False) for image in imgs]
  predictions = model(image)
  del image
  return predictions

palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])

def findContours(*args, **kwargs):
  if cv2.__version__.startswith('4'):
    contours, hierarchy = cv2.findContours(*args, **kwargs)
  elif cv2.__version__.startswith('3'):
    _, contours, hierarchy = cv2.findContours(*args, **kwargs)
  return contours, hierarchy

def compute_colors_for_labels(labels):
  l = labels[:, None]
  colors = l * palette
  colors = (colors % 255).astype("uint8")
  return colors

def overlay_mask(image, predictions):
  image = np.asarray(image)
  masks = predictions.get_field("mask").numpy()
  labels = predictions.get_field("labels").numpy()

  colors = compute_colors_for_labels(labels).tolist()

  for mask, color in zip(masks, colors):
    thresh = mask[0, :, :, None]
    contours, hierarchy = findContours(
      thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )
    image = cv2.drawContours(image, contours, -1, color, 3)

  composite = image

  return composite

CATEGORIES = [
  "__background", "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
  "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
  "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
  "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
  "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table",
  "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster",
  "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]

def overlay_boxes(image, predictions):
  labels = predictions.get_field("labels").numpy()
  boxes = predictions.bbox
  image = np.asarray(image)
  colors = compute_colors_for_labels(labels).tolist()

  for box, color in zip(boxes, colors):
    box = torch.tensor(box.numpy())
    box = box.to(torch.int64)
    top_left, bottom_right = box[:2].tolist(), box[2:].tolist()
    image = cv2.rectangle(
      image, tuple(top_left), tuple(bottom_right), tuple(color), 1
    )

  return image

def overlay_class_names(image, predictions):
  scores = predictions.get_field("scores").numpy().tolist()
  labels = predictions.get_field("labels").numpy().tolist()
  labels = [CATEGORIES[int(i)] for i in labels]
  boxes = predictions.bbox.numpy()
  image = np.asarray(image)
  template = "{}: {:.2f}"
  for box, score, label in zip(boxes, scores, labels):
    x, y = box[:2]
    s = template.format(label, score)
    x, y = int(x), int(y)
    cv2.putText(
      image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1
    )

  return image


if __name__ == '__main__':
  parser = argparse.ArgumentParser(description='Run MaskRCNN', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--image', type=str, help="Path of the image to run")
  parser.add_argument('--threshold', type=float, default=0.7, help="Detector threshold")
  parser.add_argument('--size_scale', type=float, default=1.0, help="Image resize multiplier")
  parser.add_argument('--out', type=str, default="/tmp/rendered.png", help="Output filename")
  args = parser.parse_args()

  resnet = ResNet(50, num_classes=None, stride_in_1x1=True)
  model_tiny = MaskRCNN(resnet)
  model_tiny.load_from_pretrained()
  img = Image.open(args.image)
  top_result_tiny = compute_prediction(img, model_tiny, confidence_threshold=args.threshold, size_scale=args.size_scale)
  bbox_image = overlay_boxes(img, top_result_tiny)
  mask_image = overlay_mask(bbox_image, top_result_tiny)
  final_image = overlay_class_names(mask_image, top_result_tiny)

  im = Image.fromarray(final_image)
  print(f"saving {args.out}")
  im.save(args.out)
  im.show()
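compute_colors_for_labels in miniature: each label index is multiplied by three large constants and reduced mod 255, giving a stable pseudo-random RGB color per class (the label ids below are hypothetical):

import numpy as np
palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
labels = np.array([1, 17])                                 # hypothetical class ids
print((labels[:, None] * palette % 255).astype("uint8"))   # one RGB row per label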
@@ -1,118 +0,0 @@
import json, pprint
from tinygrad import fetch, nn, Tensor
from tinygrad.helpers import DEBUG

class FeedForward:
  def __init__(self, model_dim, intermediate_dim):
    self.proj_1 = nn.Linear(model_dim, 2*intermediate_dim, bias=False)
    self.proj_2 = nn.Linear(intermediate_dim, model_dim, bias=False)

  def __call__(self, x):
    y_12 = self.proj_1(x)
    y_1, y_2 = y_12.chunk(2, dim=-1)
    return self.proj_2(y_1.silu() * y_2)

# NOTE: this RoPE doesn't match LLaMA's?
def _rotate_half(x: Tensor) -> Tensor:
  x1, x2 = x.chunk(2, dim=-1)
  return Tensor.cat(-x2, x1, dim=-1)

def _apply_rotary_pos_emb(x: Tensor, pos_sin: Tensor, pos_cos: Tensor) -> Tensor:
  return (x * pos_cos) + (_rotate_half(x) * pos_sin)

class Attention:
  def __init__(self, model_dim, num_query_heads, num_kv_heads, head_dim):
    self.qkv_proj = nn.Linear(model_dim, (num_query_heads + num_kv_heads*2) * head_dim, bias=False)
    self.num_query_heads, self.num_kv_heads = num_query_heads, num_kv_heads
    self.head_dim = head_dim
    self.q_norm = nn.RMSNorm(head_dim)
    self.k_norm = nn.RMSNorm(head_dim)
    self.out_proj = nn.Linear(num_query_heads * head_dim, model_dim, bias=False)

  def __call__(self, x:Tensor) -> Tensor:
    batch_size, seq_len, embed_dim = x.shape
    qkv = self.qkv_proj(x)
    qkv = qkv.reshape(batch_size, seq_len, self.num_query_heads+self.num_kv_heads*2, self.head_dim).transpose(1, 2)
    xq,xk,xv = qkv.split([self.num_query_heads, self.num_kv_heads, self.num_kv_heads], dim=1)
    xq = self.q_norm(xq)
    xk = self.k_norm(xk)

    # add positional embedding (how many kernels is this?)
    freq_constant = 10000
    inv_freq = 1.0 / (freq_constant ** (Tensor.arange(0, self.head_dim, 2) / self.head_dim))
    pos_index_theta = Tensor.einsum("i,j->ij", Tensor.arange(seq_len), inv_freq)
    emb = Tensor.cat(pos_index_theta, pos_index_theta, dim=-1)
    cos_emb, sin_emb = emb.cos()[None, None, :, :], emb.sin()[None, None, :, :]
    xq = _apply_rotary_pos_emb(xq, sin_emb, cos_emb)
    xk = _apply_rotary_pos_emb(xk, sin_emb, cos_emb)

    # grouped-query attention
    num_groups = self.num_query_heads // self.num_kv_heads
    xk = xk.repeat_interleave(num_groups, dim=1)
    xv = xv.repeat_interleave(num_groups, dim=1)

    # masked attention
    #start_pos = 0
    #mask = Tensor.full((1, 1, seq_len, start_pos+seq_len), float("-inf"), dtype=xq.dtype, device=xq.device).triu(start_pos+1)
    #attn_output = xq.scaled_dot_product_attention(xk, xv, mask).transpose(1, 2)

    # causal is fine, no mask needed
    attn_output = xq.scaled_dot_product_attention(xk, xv, is_causal=True).transpose(1, 2)
    return self.out_proj(attn_output.reshape(batch_size, seq_len, self.num_query_heads * self.head_dim))

class Layer:
  def __init__(self, model_dim, intermediate_dim, num_query_heads, num_kv_heads, head_dim):
    self.ffn = FeedForward(model_dim, intermediate_dim)
    self.attn = Attention(model_dim, num_query_heads, num_kv_heads, head_dim)
    self.ffn_norm = nn.RMSNorm(model_dim)
    self.attn_norm = nn.RMSNorm(model_dim)

  def __call__(self, x:Tensor) -> Tensor: # (batch, seq_len, embed_dim)
    x = x + self.attn(self.attn_norm(x))
    x = x + self.ffn(self.ffn_norm(x))
    return x

# stupidly complex
def make_divisible(v, divisor):
  new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
  if new_v < 0.9 * v: new_v += divisor
  return new_v

class Transformer:
  def __init__(self, cfg):
    if DEBUG >= 3: pprint.pp(cfg)
    self.layers = [Layer(cfg['model_dim'], make_divisible(int(cfg["model_dim"] * cfg['ffn_multipliers'][i]), cfg['ffn_dim_divisor']),
                         cfg['num_query_heads'][i], cfg['num_kv_heads'][i], cfg['head_dim']) for i in range(cfg['num_transformer_layers'])]
    self.norm = nn.RMSNorm(cfg['model_dim'])
    self.token_embeddings = nn.Embedding(cfg['vocab_size'], cfg['model_dim'])

  def __call__(self, tokens:Tensor):
    # _bsz, seqlen = tokens.shape
    x = self.token_embeddings(tokens)
    for l in self.layers: x = l(x)
    return self.norm(x) @ self.token_embeddings.weight.T

if __name__ == "__main__":
  #model_name = "OpenELM-270M-Instruct"
  model_name = "OpenELM-270M" # this is fp32
  model = Transformer(json.loads(fetch(f"https://huggingface.co/apple/{model_name}/resolve/main/config.json?download=true").read_bytes()))
  weights = nn.state.safe_load(fetch(f"https://huggingface.co/apple/{model_name}/resolve/main/model.safetensors?download=true"))
  if DEBUG >= 3:
    for k, v in weights.items(): print(k, v.shape)
  nn.state.load_state_dict(model, {k.removeprefix("transformer."):v for k,v in weights.items()})

  from sentencepiece import SentencePieceProcessor
  tokenizer = SentencePieceProcessor(fetch("https://github.com/karpathy/llama2.c/raw/master/tokenizer.model").as_posix())
  toks = [tokenizer.bos_id()] + tokenizer.encode("Some car brands include")
  for i in range(100):
    ttoks = Tensor([toks])
    out = model(ttoks).realize()
    t0 = out[0].argmax(axis=-1).tolist()
    toks.append(t0[-1])
    # hmmm...passthrough still doesn't match (it shouldn't, it outputs the most likely)
    print(tokenizer.decode(toks))
    #print(toks)
    #print(tokenizer.decode(t0))
    #print(t0)
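make_divisible in a standalone sketch: it rounds to the nearest multiple of divisor, never below the divisor itself, and bumps up one multiple if rounding lost more than 10% of the original value (the divisor value here is hypothetical; OpenELM supplies it as ffn_dim_divisor):

def make_divisible(v, divisor):
  new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
  if new_v < 0.9 * v: new_v += divisor
  return new_v
print(make_divisible(1250, 256), make_divisible(200, 256))  # 1280 256 (the max() floor kicks in)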
@@ -1,55 +0,0 @@
from tinygrad.helpers import trange
from tinygrad.nn.datasets import mnist
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
from functools import partial

class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.c1 = nn.Conv2d(1, 32, 5)
    self.c2 = nn.Conv2d(32, 32, 5)
    self.bn1 = nn.BatchNorm(32)
    self.m1 = nn.MaxPool2d(2)
    self.c3 = nn.Conv2d(32, 64, 3)
    self.c4 = nn.Conv2d(64, 64, 3)
    self.bn2 = nn.BatchNorm(64)
    self.m2 = nn.MaxPool2d(2)
    self.lin = nn.Linear(576, 10)
  def __call__(self, x):
    x = mx.maximum(self.c1(x), 0)
    x = mx.maximum(self.c2(x), 0)
    x = self.m1(self.bn1(x))
    x = mx.maximum(self.c3(x), 0)
    x = mx.maximum(self.c4(x), 0)
    x = self.m2(self.bn2(x))
    return self.lin(mx.flatten(x, 1))

if __name__ == "__main__":
  X_train, Y_train, X_test, Y_test = mnist()
  X_train = mx.array(X_train.float().permute((0,2,3,1)).numpy())
  Y_train = mx.array(Y_train.numpy())
  X_test = mx.array(X_test.float().permute((0,2,3,1)).numpy())
  Y_test = mx.array(Y_test.numpy())

  model = Model()
  optimizer = optim.Adam(1e-3)
  def loss_fn(model, x, y): return nn.losses.cross_entropy(model(x), y).mean()

  state = [model.state, optimizer.state]
  @partial(mx.compile, inputs=state, outputs=state)
  def step(samples):
    # Compiled functions will also treat any inputs not in the parameter list as constants.
    X,Y = X_train[samples], Y_train[samples]
    loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
    loss, grads = loss_and_grad_fn(model, X, Y)
    optimizer.update(model, grads)
    return loss

  test_acc = float('nan')
  for i in (t:=trange(70)):
    samples = mx.random.randint(0, X_train.shape[0], (512,)) # putting this in JIT didn't work well
    loss = step(samples)
    if i%10 == 9: test_acc = ((model(X_test).argmax(axis=-1) == Y_test).sum() * 100 / X_test.shape[0]).item()
    t.set_description(f"loss: {loss.item():6.2f} test_accuracy: {test_acc:5.2f}%")
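Why the final Linear is 576 wide: each unpadded 5x5 conv trims 4 pixels from the side, each 3x3 trims 2, and each MaxPool2d(2) halves it, so 28 -> 24 -> 20 -> 10 -> 8 -> 6 -> 3, and 64 channels * 3 * 3 = 576. A one-liner check:

side = 28
for k in (5, 5): side -= k - 1   # two 5x5 convs
side //= 2                       # first max pool
for k in (3, 3): side -= k - 1   # two 3x3 convs
side //= 2                       # second max pool
print(64 * side * side)          # 576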
@@ -1,45 +0,0 @@
import gymnasium as gym
import numpy as np
from gymnasium.envs.registration import register

# a very simple game
# one of <size> lights will light up
# take the action of the lit up light
# in <hard_mode>, you act differently based on the step number and need to track this

class PressTheLightUpButton(gym.Env):
  metadata = {"render_modes": []}
  def __init__(self, render_mode=None, size=2, game_length=10, hard_mode=False):
    self.size, self.game_length = size, game_length
    self.observation_space = gym.spaces.Box(0, 1, shape=(self.size,), dtype=np.float32)
    self.action_space = gym.spaces.Discrete(self.size)
    self.step_num = 0
    self.done = True
    self.hard_mode = hard_mode

  def _get_obs(self):
    obs = [0]*self.size
    if self.step_num < len(self.state):
      obs[self.state[self.step_num]] = 1
    return np.array(obs, dtype=np.float32)

  def reset(self, seed=None, options=None):
    super().reset(seed=seed)
    self.state = np.random.randint(0, self.size, size=self.game_length)
    self.step_num = 0
    self.done = False
    return self._get_obs(), {}

  def step(self, action):
    target = ((action + self.step_num) % self.size) if self.hard_mode else action
    reward = int(target == self.state[self.step_num])
    self.step_num += 1
    if not reward:
      self.done = True
    return self._get_obs(), reward, self.done, self.step_num >= self.game_length, {}

register(
  id="PressTheLightUpButton-v0",
  entry_point="examples.rl.lightupbutton:PressTheLightUpButton",
  max_episode_steps=None,
)
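A minimal rollout against the registered env (a sketch; it assumes gymnasium is installed and the module above is importable as examples.rl.lightupbutton so register() runs). Since the optimal action is simply the index of the lit button, an argmax agent scores the full game_length when hard_mode is off:

import gymnasium as gym
import examples.rl.lightupbutton  # noqa: F401  # importing runs the register() above
env = gym.make("PressTheLightUpButton-v0", size=2, game_length=10)
obs, _ = env.reset()
total, done, truncated = 0, False, False
while not (done or truncated):
  obs, reward, done, truncated, _ = env.step(int(obs.argmax()))  # press the lit button
  total += reward
print("episode reward:", total)  # 10 with hard_mode=False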
@@ -1,136 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
|
||||
import sys
|
||||
import numpy as np
|
||||
from tinygrad.nn.state import get_parameters
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.nn import BatchNorm2d, optim
|
||||
from tinygrad.helpers import getenv
|
||||
from extra.datasets import fetch_mnist
|
||||
from extra.augment import augment_img
from extra.training import train, evaluate
GPU = getenv("GPU")
QUICK = getenv("QUICK")
DEBUG = getenv("DEBUG")

class SqueezeExciteBlock2D:
  def __init__(self, filters):
    self.filters = filters
    self.weight1 = Tensor.scaled_uniform(self.filters, self.filters//32)
    self.bias1 = Tensor.scaled_uniform(1, self.filters//32)
    self.weight2 = Tensor.scaled_uniform(self.filters//32, self.filters)
    self.bias2 = Tensor.scaled_uniform(1, self.filters)

  def __call__(self, input):
    se = input.avg_pool2d(kernel_size=(input.shape[2], input.shape[3])) # GlobalAveragePool2D
    se = se.reshape(shape=(-1, self.filters))
    se = se.dot(self.weight1) + self.bias1
    se = se.relu()
    se = se.dot(self.weight2) + self.bias2
    se = se.sigmoid().reshape(shape=(-1, self.filters, 1, 1)) # for broadcasting
    se = input.mul(se)
    return se

class ConvBlock:
  def __init__(self, h, w, inp, filters=128, conv=3):
    self.h, self.w = h, w
    self.inp = inp
    # init weights
    self.cweights = [Tensor.scaled_uniform(filters, inp if i==0 else filters, conv, conv) for i in range(3)]
    self.cbiases = [Tensor.scaled_uniform(1, filters, 1, 1) for i in range(3)]
    # init layers
    self._bn = BatchNorm2d(filters)
    self._seb = SqueezeExciteBlock2D(filters)

  def __call__(self, input):
    x = input.reshape(shape=(-1, self.inp, self.w, self.h))
    for cweight, cbias in zip(self.cweights, self.cbiases):
      x = x.pad(padding=[1,1,1,1]).conv2d(cweight).add(cbias).relu()
    x = self._bn(x)
    x = self._seb(x)
    return x

class BigConvNet:
  def __init__(self):
    self.conv = [ConvBlock(28,28,1), ConvBlock(28,28,128), ConvBlock(14,14,128)]
    self.weight1 = Tensor.scaled_uniform(128,10)
    self.weight2 = Tensor.scaled_uniform(128,10)

  def parameters(self):
    if DEBUG: # keeping this for a moment
      pars = [par for par in get_parameters(self) if par.requires_grad]
      no_pars = 0
      for par in pars:
        print(par.shape)
        no_pars += np.prod(par.shape)
      print('no of parameters', no_pars)
      return pars
    else:
      return get_parameters(self)

  def save(self, filename):
    with open(filename+'.npy', 'wb') as f:
      for par in get_parameters(self):
        #if par.requires_grad:
        np.save(f, par.numpy())

  def load(self, filename):
    with open(filename+'.npy', 'rb') as f:
      for par in get_parameters(self):
        #if par.requires_grad:
        try:
          par.numpy()[:] = np.load(f)
          if GPU:
            par.gpu()
        except Exception:
          print('Could not load parameter')

  def forward(self, x):
    x = self.conv[0](x)
    x = self.conv[1](x)
    x = x.avg_pool2d(kernel_size=(2,2))
    x = self.conv[2](x)
    x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) # global
    x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) # global
    xo = x1.dot(self.weight1) + x2.dot(self.weight2)
    return xo


if __name__ == "__main__":
  lrs = [1e-4, 1e-5] if QUICK else [1e-3, 1e-4, 1e-5, 1e-5]
  epochss = [2, 1] if QUICK else [13, 3, 3, 1]
  BS = 32

  lmbd = 0.00025
  lossfn = lambda out,y: out.sparse_categorical_crossentropy(y) + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
  X_train, Y_train, X_test, Y_test = fetch_mnist()
  X_train = X_train.reshape(-1, 28, 28).astype(np.uint8)
  X_test = X_test.reshape(-1, 28, 28).astype(np.uint8)
  steps = len(X_train)//BS
  np.random.seed(1337)
  if QUICK:
    steps = 1
    X_test, Y_test = X_test[:BS], Y_test[:BS]

  model = BigConvNet()

  if len(sys.argv) > 1:
    try:
      model.load(sys.argv[1])
      print('Loaded weights "'+sys.argv[1]+'", evaluating...')
      evaluate(model, X_test, Y_test, BS=BS)
    except Exception:
      print('could not load weights "'+sys.argv[1]+'".')

  if GPU:
    params = get_parameters(model)
    [x.gpu_() for x in params]

  for lr, epochs in zip(lrs, epochss):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(1,epochs+1):
      # first epoch without augmentation
      X_aug = X_train if epoch == 1 else augment_img(X_train)
      train(model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, BS=BS)
      accuracy = evaluate(model, X_test, Y_test, BS=BS)
      model.save(f'examples/checkpoint{accuracy * 1e6:.0f}')
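Editor's note: the lossfn above pairs sparse cross-entropy with an L1 penalty on the two classifier heads. A minimal standalone sketch of the same composite loss, with hypothetical shapes (not part of the deleted file):

from tinygrad import Tensor
out = Tensor.randn(32, 10)                              # hypothetical logits for a batch of 32
y = Tensor.randint(32, high=10)                         # hypothetical integer labels
w1, w2 = Tensor.randn(128, 10), Tensor.randn(128, 10)   # stand-ins for model.weight1/weight2
loss = out.sparse_categorical_crossentropy(y) + 0.00025*(w1.abs() + w2.abs()).sum()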
@@ -1,17 +0,0 @@
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d, BatchNorm2d
from tinygrad.nn.state import get_parameters

if __name__ == "__main__":
  with Tensor.train():

    BS, C1, H, W = 4, 16, 224, 224
    C2, K, S, P = 64, 7, 2, 1

    x = Tensor.uniform(BS, C1, H, W)
    conv = Conv2d(C1, C2, kernel_size=K, stride=S, padding=P)
    bn = BatchNorm2d(C2, track_running_stats=False)
    for t in get_parameters([x, conv, bn]): t.realize()

    print("running network")
    x.sequential([conv, bn]).numpy()
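Editor's note: to see where the time goes, the same graph can be wrapped in tinygrad's Timing context manager (a hedged sketch reusing the x, conv, bn defined above):

from tinygrad.helpers import Timing
with Timing("conv+bn: "):
  x.sequential([conv, bn]).numpy()  # realizes the graph and copies the result back to host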
@@ -1,669 +0,0 @@
# original implementation: https://github.com/svc-develop-team/so-vits-svc
from __future__ import annotations
import sys, logging, time, io, math, argparse, operator, numpy as np
from functools import partial, reduce
from pathlib import Path
from typing import Tuple, Optional, Type
from tinygrad import nn, dtypes, Tensor
from tinygrad.helpers import getenv, fetch
from tinygrad.nn.state import torch_load
from examples.vits import ResidualCouplingBlock, PosteriorEncoder, Encoder, ResBlock1, ResBlock2, LRELU_SLOPE, sequence_mask, split, get_hparams_from_file, load_checkpoint, weight_norm, HParams
from examples.sovits_helpers import preprocess
import soundfile

DEBUG = getenv("DEBUG")

F0_BIN = 256
F0_MAX = 1100.0
F0_MIN = 50.0
F0_MEL_MIN = 1127 * np.log(1 + F0_MIN / 700)
F0_MEL_MAX = 1127 * np.log(1 + F0_MAX / 700)

class SpeechEncoder:
  def __init__(self, hidden_dim, model:ContentVec): self.hidden_dim, self.model = hidden_dim, model
  def encode(self): raise NotImplementedError("implement me")
  @classmethod
  def load_from_pretrained(cls, checkpoint_path:str, checkpoint_url:str) -> ContentVec:
    contentvec = ContentVec.load_from_pretrained(checkpoint_path, checkpoint_url)
    return cls(contentvec)

class ContentVec256L9(SpeechEncoder):
  def __init__(self, model:ContentVec): super().__init__(hidden_dim=256, model=model)
  def encode(self, wav: Tensor):
    feats = wav
    if len(feats.shape) == 2: # double channels
      feats = feats.mean(-1)
    assert len(feats.shape) == 1, feats.dim()
    feats = feats.reshape(1, -1)
    padding_mask = Tensor.zeros_like(feats).cast(dtypes.bool)
    logits = self.model.extract_features(feats.to(wav.device), padding_mask=padding_mask.to(wav.device), output_layer=9)
    feats = self.model.final_proj(logits[0])
    return feats.transpose(1,2)

class ContentVec768L12(SpeechEncoder):
  def __init__(self, model:ContentVec): super().__init__(hidden_dim=768, model=model)
  def encode(self, wav: Tensor):
    feats = wav
    if len(feats.shape) == 2: # double channels
      feats = feats.mean(-1)
    assert len(feats.shape) == 1, feats.dim()
    feats = feats.reshape(1, -1)
    padding_mask = Tensor.zeros_like(feats).cast(dtypes.bool)
    logits = self.model.extract_features(feats.to(wav.device), padding_mask=padding_mask.to(wav.device), output_layer=12)
    return logits[0].transpose(1,2)

# original code for contentvec: https://github.com/auspicious3000/contentvec/
class ContentVec:
  # self.final_proj dims are hardcoded and depend on fairseq.data.dictionary Dictionary in the checkpoint. This param can't yet be loaded since there is no pickle for it. See with DEBUG=2.
  # This means that the ContentVec only works with the hubert weights used in all SVC models
  def __init__(self, cfg: HParams):
    self.feature_grad_mult, self.untie_final_proj = cfg.feature_grad_mult, cfg.untie_final_proj
    feature_enc_layers = eval(cfg.conv_feature_layers)
    self.embed = feature_enc_layers[-1][0]
    final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
    self.feature_extractor = ConvFeatureExtractionModel(conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, conv_bias=cfg.conv_bias)
    self.post_extract_proj = nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else None
    self.encoder = TransformerEncoder(cfg)
    self.layer_norm = nn.LayerNorm(self.embed)
    self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim * 1) if self.untie_final_proj else nn.Linear(cfg.encoder_embed_dim, final_dim)
    self.mask_emb = Tensor.uniform(cfg.encoder_embed_dim, dtype=dtypes.float32)
    self.label_embs_concat = Tensor.uniform(504, final_dim, dtype=dtypes.float32)
  def forward_features(self, source, padding_mask):
    if self.feature_grad_mult > 0:
      features = self.feature_extractor(source, padding_mask)
      if self.feature_grad_mult != 1.0: pass # training: GradMultiply.forward(features, self.feature_grad_mult)
    else:
      features = self.feature_extractor(source, padding_mask)
    return features
  def forward_padding_mask(self, features, padding_mask): # replaces original forward_padding_mask for batch inference
    lengths_org = tilde(padding_mask.cast(dtypes.bool)).cast(dtypes.int64).sum(1) # ensure it's bool for tilde
    lengths = (lengths_org - 400).float().div(320).floor().cast(dtypes.int64) + 1 # intermediate float to divide
    padding_mask = lengths_to_padding_mask(lengths)
    return padding_mask
  def extract_features(self, source: Tensor, spk_emb:Tensor=None, padding_mask=None, ret_conv=False, output_layer=None, tap=False):
    features = self.forward_features(source, padding_mask)
    if padding_mask is not None:
      padding_mask = self.forward_padding_mask(features, padding_mask)
    features = features.transpose(1, 2)
    features = self.layer_norm(features)
    if self.post_extract_proj is not None:
      features = self.post_extract_proj(features)
    x, _ = self.encoder(features, spk_emb, padding_mask=padding_mask, layer=(None if output_layer is None else output_layer - 1), tap=tap)
    res = features if ret_conv else x
    return res, padding_mask
  @classmethod
  def load_from_pretrained(cls, checkpoint_path:str, checkpoint_url:str) -> ContentVec:
    fetch(checkpoint_url, checkpoint_path)
    cfg = load_fairseq_cfg(checkpoint_path)
    enc = cls(cfg.model)
    _ = load_checkpoint_enc(checkpoint_path, enc, None)
    logging.debug(f"{cls.__name__}: Loaded model with cfg={cfg}")
    return enc

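Editor's note: the two SpeechEncoder subclasses above differ only in which transformer layer they tap and whether final_proj is applied. A hedged sketch, assuming a loaded ContentVec cv and a mono waveform Tensor wav (both hypothetical):

enc256 = ContentVec256L9(cv)   # taps layer 9, projects to 256 dims -> (1, 256, T)
enc768 = ContentVec768L12(cv)  # taps layer 12, keeps raw 768-dim features -> (1, 768, T)
feats = enc768.encode(wav)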
class TransformerEncoder:
  def __init__(self, cfg: HParams):
    def make_conv() -> nn.Conv1d:
      layer = nn.Conv1d(self.embedding_dim, self.embedding_dim, kernel_size=cfg.conv_pos, padding=cfg.conv_pos // 2, groups=cfg.conv_pos_groups)
      std = math.sqrt(4 / (cfg.conv_pos * self.embedding_dim))
      layer.weight, layer.bias = (Tensor.normal(*layer.weight.shape, std=std)), (Tensor.zeros(*layer.bias.shape))
      # for training: layer.weights need to be weight_normed
      return layer
    self.dropout, self.embedding_dim, self.layer_norm_first, self.layerdrop, self.num_layers, self.num_layers_1 = cfg.dropout, cfg.encoder_embed_dim, cfg.layer_norm_first, cfg.encoder_layerdrop, cfg.encoder_layers, cfg.encoder_layers_1
    self.pos_conv, self.pos_conv_remove = [make_conv()], (1 if cfg.conv_pos % 2 == 0 else 0)
    self.layers = [
      TransformerEncoderLayer(self.embedding_dim, cfg.encoder_ffn_embed_dim, cfg.encoder_attention_heads, self.dropout, cfg.attention_dropout, cfg.activation_dropout, cfg.activation_fn, self.layer_norm_first, cond_layer_norm=(i >= cfg.encoder_layers))
      for i in range(cfg.encoder_layers + cfg.encoder_layers_1)
    ]
    self.layer_norm = nn.LayerNorm(self.embedding_dim)
    self.cond_layer_norm = CondLayerNorm(self.embedding_dim) if cfg.encoder_layers_1 > 0 else None
    # training: apply init_bert_params
  def __call__(self, x, spk_emb, padding_mask=None, layer=None, tap=False):
    x, layer_results = self.extract_features(x, spk_emb, padding_mask, layer, tap)
    if self.layer_norm_first and layer is None:
      x = self.cond_layer_norm(x, spk_emb) if (self.num_layers_1 > 0) else self.layer_norm(x)
    return x, layer_results
  def extract_features(self, x: Tensor, spk_emb: Tensor, padding_mask=None, tgt_layer=None, tap=False):
    if tgt_layer is not None: # and not self.training
      assert tgt_layer >= 0 and tgt_layer < len(self.layers)
    if padding_mask is not None:
      # x[padding_mask] = 0
      assert padding_mask.shape == x.shape[:len(padding_mask.shape)] # first few dims of x must match padding_mask
      tmp_mask = padding_mask.unsqueeze(-1).repeat((1, 1, x.shape[-1]))
      tmp_mask = tilde(tmp_mask.cast(dtypes.bool))
      x = tmp_mask.where(x, 0)
    x_conv = self.pos_conv[0](x.transpose(1,2))
    if self.pos_conv_remove > 0: x_conv = x_conv[:, :, : -self.pos_conv_remove]
    x_conv = x_conv.gelu().transpose(1, 2)
    x = (x + x_conv).transpose(0, 1) # B x T x C -> T x B x C
    if not self.layer_norm_first: x = self.layer_norm(x)
    x = x.dropout(p=self.dropout)
    layer_results = []
    r = None
    for i, layer in enumerate(self.layers):
      if i < self.num_layers: # if (not self.training or (dropout_probability > self.layerdrop)) and (i < self.num_layers):
        assert layer.cond_layer_norm == False
        x = layer(x, self_attn_padding_mask=padding_mask, need_weights=False)
      if tgt_layer is not None or tap:
        layer_results.append(x.transpose(0, 1))
      if i >= self.num_layers:
        assert layer.cond_layer_norm == True
        x = layer(x, emb=spk_emb, self_attn_padding_mask=padding_mask, need_weights=False)
      if i == tgt_layer:
        r = x
        break
    if r is not None:
      x = r
    x = x.transpose(0, 1) # T x B x C -> B x T x C
    return x, layer_results

class TransformerEncoderLayer:
  def __init__(self, embedding_dim=768, ffn_embedding_dim=3072, num_attention_heads=8, dropout=0.1, attention_dropout=0.1, activation_dropout=0.1, activation_fn="relu", layer_norm_first=False, cond_layer_norm=False):
    def get_activation_fn(activation):
      if activation == "relu": return Tensor.relu
      if activation == "gelu": return Tensor.gelu
      else: raise RuntimeError(f"activation function={activation} is not foreseen")
    self.embedding_dim, self.dropout, self.activation_dropout, self.layer_norm_first, self.num_attention_heads, self.cond_layer_norm, self.activation_fn = embedding_dim, dropout, activation_dropout, layer_norm_first, num_attention_heads, cond_layer_norm, get_activation_fn(activation_fn)
    self.self_attn = MultiHeadAttention(self.embedding_dim, self.num_attention_heads)
    self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim) if not cond_layer_norm else CondLayerNorm(self.embedding_dim)
    self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
    self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
    self.final_layer_norm = nn.LayerNorm(self.embedding_dim) if not cond_layer_norm else CondLayerNorm(self.embedding_dim)
  def __call__(self, x:Tensor, self_attn_mask:Tensor=None, self_attn_padding_mask:Tensor=None, emb:Tensor=None, need_weights=False):
    #self_attn_padding_mask = self_attn_padding_mask.reshape(x.shape[0], 1, 1, self_attn_padding_mask.shape[1]).expand(-1, self.num_attention_heads, -1, -1).reshape(x.shape[0] * self.num_attention_heads, 1, self_attn_padding_mask.shape[1]) if self_attn_padding_mask is not None else None
    assert self_attn_mask is None and self_attn_padding_mask is not None
    residual = x
    if self.layer_norm_first:
      x = self.self_attn_layer_norm(x) if not self.cond_layer_norm else self.self_attn_layer_norm(x, emb)
      x = self.self_attn(x=x, mask=self_attn_padding_mask)
      x = x.dropout(self.dropout)
      x = residual + x
      residual = x # pre-norm: restart the residual stream before the FFN sub-block
      x = self.final_layer_norm(x) if not self.cond_layer_norm else self.final_layer_norm(x, emb)
      x = self.activation_fn(self.fc1(x))
      x = x.dropout(self.activation_dropout)
      x = self.fc2(x)
      x = x.dropout(self.dropout)
      x = residual + x
    else:
      x = self.self_attn(x=x, mask=self_attn_padding_mask)
      x = x.dropout(self.dropout)
      x = residual + x
      x = self.self_attn_layer_norm(x) if not self.cond_layer_norm else self.self_attn_layer_norm(x, emb)
      residual = x
      x = self.activation_fn(self.fc1(x))
      x = x.dropout(self.activation_dropout)
      x = self.fc2(x)
      x = x.dropout(self.dropout)
      x = residual + x
      x = self.final_layer_norm(x) if not self.cond_layer_norm else self.final_layer_norm(x, emb)
    return x

class MultiHeadAttention:
  def __init__(self, n_state, n_head):
    self.n_state, self.n_head = n_state, n_head
    self.q_proj, self.k_proj, self.v_proj, self.out_proj = [nn.Linear(n_state, n_state) for _ in range(4)]
  def __call__(self, x:Tensor, xa:Optional[Tensor]=None, mask:Optional[Tensor]=None):
    x = x.transpose(0,1) # TxBxC -> BxTxC
    q, k, v = self.q_proj(x), self.k_proj(xa or x), self.v_proj(xa or x)
    q, k, v = [x.reshape(*q.shape[:2], self.n_head, -1) for x in (q, k, v)]
    wv = Tensor.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), None).transpose(1, 2).reshape(*x.shape[:2], -1)
    ret = self.out_proj(wv).transpose(0,1) # BxTxC -> TxBxC
    return ret

class ConvFeatureExtractionModel:
  def __init__(self, conv_layers, dropout=.0, mode="default", conv_bias=False):
    assert mode in {"default", "group_norm_masked", "layer_norm"}
    def block(n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False):
      def make_conv():
        conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
        conv.weight = Tensor.kaiming_normal(*conv.weight.shape)
        return conv
      assert (is_layer_norm and is_group_norm) == False, "layer norm and group norm are exclusive"
      if is_layer_norm:
        return [make_conv(), partial(Tensor.dropout, p=dropout), [partial(Tensor.transpose, dim0=-2, dim1=-1), nn.LayerNorm(dim, elementwise_affine=True), partial(Tensor.transpose, dim0=-2, dim1=-1)], Tensor.gelu]
      elif is_group_norm and mode == "default":
        return [make_conv(), partial(Tensor.dropout, p=dropout), nn.GroupNorm(dim, dim, affine=True), Tensor.gelu]
      elif is_group_norm and mode == "group_norm_masked":
        return [make_conv(), partial(Tensor.dropout, p=dropout), GroupNormMasked(dim, dim, affine=True), Tensor.gelu]
      else:
        return [make_conv(), partial(Tensor.dropout, p=dropout), Tensor.gelu]
    in_d, self.conv_layers, self.mode = 1, [], mode
    for i, cl in enumerate(conv_layers):
      assert len(cl) == 3, "invalid conv definition: " + str(cl)
      (dim, k, stride) = cl
      if i == 0: self.cl = cl
      self.conv_layers.append(block(in_d, dim, k, stride, is_layer_norm=(mode == "layer_norm"), is_group_norm=((mode == "default" or mode == "group_norm_masked") and i == 0), conv_bias=conv_bias))
      in_d = dim
  def __call__(self, x:Tensor, padding_mask:Tensor):
    x = x.unsqueeze(1) # BxT -> BxCxT
    if self.mode == "group_norm_masked":
      if padding_mask is not None:
        _, k, stride = self.cl
        lengths_org = tilde(padding_mask.cast(dtypes.bool)).cast(dtypes.int64).sum(1) # ensure padding_mask is bool for tilde
        lengths = (((lengths_org - k) / stride) + 1).floor().cast(dtypes.int64)
        padding_mask = tilde(lengths_to_padding_mask(lengths)).cast(dtypes.int64) # lengths_to_padding_mask returns bool tensor
      x = self.conv_layers[0][0](x) # padding_mask is numeric
      x = self.conv_layers[0][1](x)
      x = self.conv_layers[0][2](x, padding_mask)
      x = self.conv_layers[0][3](x)
    else:
      x = x.sequential(self.conv_layers[0]) # default
    for _, conv in enumerate(self.conv_layers[1:], start=1):
      conv = reduce(lambda a,b: operator.iconcat(a,b if isinstance(b, list) else [b]), conv, []) # flatten
      x = x.sequential(conv)
    return x

class CondLayerNorm: # https://github.com/auspicious3000/contentvec/blob/main/contentvec/modules/cond_layer_norm.py#L10
  def __init__(self, dim_last, eps=1e-5, dim_spk=256, elementwise_affine=True):
    self.dim_last, self.eps, self.dim_spk, self.elementwise_affine = dim_last, eps, dim_spk, elementwise_affine
    if self.elementwise_affine:
      self.weight_ln = nn.Linear(self.dim_spk, self.dim_last, bias=False)
      self.bias_ln = nn.Linear(self.dim_spk, self.dim_last, bias=False)
      self.weight_ln.weight, self.bias_ln.weight = (Tensor.ones(*self.weight_ln.weight.shape)), (Tensor.zeros(*self.bias_ln.weight.shape))
  def __call__(self, x: Tensor, spk_emb: Tensor):
    axis = tuple(-1-i for i in range(len(x.shape[1:])))
    x = x.layernorm(axis=axis, eps=self.eps)
    if not self.elementwise_affine: return x
    weights, bias = self.weight_ln(spk_emb), self.bias_ln(spk_emb)
    return weights * x + bias

class GroupNormMasked: # https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/modules/fp32_group_norm.py#L16
  def __init__(self, num_groups, num_channels, eps=1e-5, affine=True):
    self.num_groups, self.num_channels, self.eps, self.affine = num_groups, num_channels, eps, affine
    self.weight, self.bias = (Tensor.ones(num_channels), Tensor.zeros(num_channels)) if self.affine else (None, None)
  def __call__(self, x:Tensor, mask:Tensor):
    bsz, n_c, length = x.shape
    assert n_c % self.num_groups == 0
    x = x.reshape(bsz, self.num_groups, n_c // self.num_groups, length)
    if mask is None: mask = Tensor.ones_like(x)
    else: mask = mask.reshape(bsz, 1, 1, length)
    x = x * mask
    lengths = mask.sum(axis=3, keepdim=True)
    assert x.shape[2] == 1
    mean_ = x.mean(axis=3, keepdim=True)
    mean = mean_ * length / lengths
    var = (((x.std(axis=3, keepdim=True) ** 2) + mean_**2) * length / lengths - mean**2) + self.eps
    return x.add(-mean).div(var.sqrt()).reshape(bsz, n_c, length).mul(self.weight.reshape(1,-1,1)).add(self.bias.reshape(1,-1,1))

class Synthesizer:
  def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, ssl_dim, n_speakers, sampling_rate=44100, vol_embedding=False, n_flow_layer=4, **kwargs):
    self.spec_channels, self.inter_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.segment_size, self.n_speakers, self.gin_channels, self.vol_embedding = spec_channels, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, segment_size, n_speakers, gin_channels, vol_embedding
    self.emb_g = nn.Embedding(n_speakers, gin_channels)
    if vol_embedding: self.emb_vol = nn.Linear(1, hidden_channels)
    self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
    self.enc_p = TextEncoder(inter_channels, hidden_channels, kernel_size, n_layers, filter_channels=filter_channels, n_heads=n_heads, p_dropout=p_dropout)
    self.dec = Generator(sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels)
    self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels)
    self.emb_uv = nn.Embedding(vocab_size=2, embed_size=hidden_channels)
  def infer(self, c:Tensor, f0:Tensor, uv:Tensor, g:Tensor=None, noise_scale=0.35, seed=52468, vol=None) -> Tuple[Tensor, Tensor]:
    Tensor.manual_seed(getenv('SEED', seed))
    c_lengths = (Tensor.ones([c.shape[0]]) * c.shape[-1]).to(c.device)
    if len(g.shape) == 1: g = g.unsqueeze(0)
    g = self.emb_g(g).transpose(1, 2)
    x_mask = sequence_mask(c_lengths, c.shape[2]).unsqueeze(1).cast(c.dtype)
    vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
    x = self.pre(c) * x_mask + self.emb_uv(uv.cast(dtypes.int64)).transpose(1, 2) + vol
    z_p, _, _, c_mask = self.enc_p.forward(x, x_mask, f0=self._f0_to_coarse(f0), noise_scale=noise_scale)
    z = self.flow.forward(z_p, c_mask, g=g, reverse=True)
    o = self.dec.forward(z * c_mask, g=g, f0=f0)
    return o, f0
  def _f0_to_coarse(self, f0 : Tensor):
    f0_mel = 1127 * (1 + f0 / 700).log()
    a = (F0_BIN - 2) / (F0_MEL_MAX - F0_MEL_MIN)
    b = F0_MEL_MIN * a - 1.
    f0_mel = (f0_mel > 0).where(f0_mel * a - b, f0_mel)
    f0_coarse = f0_mel.ceil().cast(dtype=dtypes.int64)
    f0_coarse = f0_coarse * (f0_coarse > 0)
    f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)
    f0_coarse = f0_coarse * (f0_coarse < F0_BIN)
    f0_coarse = f0_coarse + ((f0_coarse >= F0_BIN) * (F0_BIN - 1))
    return f0_coarse
  @classmethod
  def load_from_pretrained(cls, config_path:str, config_url:str, weights_path:str, weights_url:str) -> Tuple[Synthesizer, HParams]:
    fetch(config_url, config_path)
    hps = get_hparams_from_file(config_path)
    fetch(weights_url, weights_path)
    net_g = cls(hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, **hps.model)
    _ = load_checkpoint(weights_path, net_g, None, skip_list=["f0_decoder"])
    logging.debug(f"{cls.__name__}: Loaded model with hps: {hps}")
    return net_g, hps

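Editor's note: _f0_to_coarse maps a pitch in Hz onto one of F0_BIN-1 mel-spaced bins. A hedged numeric check of the same arithmetic in plain numpy (values are illustrative):

import numpy as np
f0 = 440.0                                   # A4, a voiced frame
f0_mel = 1127 * np.log(1 + f0 / 700)
a = (F0_BIN - 2) / (F0_MEL_MAX - F0_MEL_MIN)
b = F0_MEL_MIN * a - 1.
print(int(np.ceil(f0_mel * a - b)))          # coarse bin index; the method clamps to [1, F0_BIN-1]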
class TextEncoder:
  def __init__(self, out_channels, hidden_channels, kernel_size, n_layers, gin_channels=0, filter_channels=None, n_heads=None, p_dropout=None):
    self.out_channels, self.hidden_channels, self.kernel_size, self.n_layers, self.gin_channels = out_channels, hidden_channels, kernel_size, n_layers, gin_channels
    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
    self.f0_emb = nn.Embedding(256, hidden_channels) # n_vocab = 256
    self.enc_ = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
  def forward(self, x, x_mask, f0=None, noise_scale=1):
    x = x + self.f0_emb(f0).transpose(1, 2)
    x = self.enc_.forward(x * x_mask, x_mask)
    stats = self.proj(x) * x_mask
    m, logs = split(stats, self.out_channels, dim=1)
    z = (m + randn_like(m) * logs.exp() * noise_scale) * x_mask
    return z, m, logs, x_mask

class Upsample:
  def __init__(self, scale_factor):
    assert scale_factor % 1 == 0, "Only integer scale factor allowed."
    self.scale = int(scale_factor)
  def forward(self, x:Tensor):
    repeats = tuple([1] * len(x.shape) + [self.scale])
    new_shape = (*x.shape[:-1], x.shape[-1] * self.scale)
    return x.unsqueeze(-1).repeat(repeats).reshape(new_shape)

class SineGen:
  def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voice_threshold=0, flag_for_pulse=False):
    self.sine_amp, self.noise_std, self.harmonic_num, self.sampling_rate, self.voiced_threshold, self.flag_for_pulse = sine_amp, noise_std, harmonic_num, samp_rate, voice_threshold, flag_for_pulse
    self.dim = self.harmonic_num + 1
  def _f02uv(self, f0): return (f0 > self.voiced_threshold).float() # generate uv signal
  def _f02sine(self, f0_values):
    def padDiff(x : Tensor): return (x.pad((0,0,-1,1)) - x).pad((0,0,0,-1))
    def mod(x: Tensor, n: int) -> Tensor: return x - n * x.div(n).floor() # this is what the % operator does in pytorch.
    rad_values = mod((f0_values / self.sampling_rate), 1) # convert to F0 in rad
    rand_ini = Tensor.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device) # initial phase noise

    # rand_ini[:, 0] = 0
    m = Tensor.ones(f0_values.shape[0]).unsqueeze(1).pad((0,f0_values.shape[2]-1,0,0)).cast(dtypes.bool)
    m = tilde(m)
    rand_ini = m.where(rand_ini, 0)

    # rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
    tmp = rad_values[:, 0, :] + rand_ini
    m = Tensor.ones(tmp.shape).pad((0,0,0,rad_values.shape[1]-1,0)).cast(dtypes.bool)
    m = tilde(m)
    tmp = tmp.unsqueeze(1).pad((0,0,0,rad_values.shape[1]-1,0))
    rad_values = m.where(rad_values, tmp)

    tmp_over_one = mod(rad_values.cumsum(1), 1)
    tmp_over_one_idx = padDiff(tmp_over_one) < 0
    cumsum_shift = Tensor.zeros_like(rad_values)

    # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
    tmp_over_one_idx = (tmp_over_one_idx * -1.0).pad((0,0,1,0))
    cumsum_shift = tmp_over_one_idx

    sines = ((rad_values + cumsum_shift).cumsum(1) * 2 * np.pi).sin()
    return sines
  def forward(self, f0, upp=None):
    fn = f0.mul(Tensor([[range(1, self.harmonic_num + 2)]], dtype=dtypes.float32).to(f0.device))
    sine_waves = self._f02sine(fn) * self.sine_amp # generate sine waveforms
    uv = self._f02uv(f0) # generate uv signal
    noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
    noise = noise_amp * randn_like(sine_waves)
    sine_waves = sine_waves * uv + noise
    return sine_waves, uv, noise

class SourceHnNSF:
  def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshold=0):
    self.sine_amp, self.noise_std = sine_amp, add_noise_std
    self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold)
    self.l_linear = nn.Linear(harmonic_num + 1, 1)
  def forward(self, x, upp=None):
    sine_waves, uv, _ = self.l_sin_gen.forward(x, upp)
    sine_merge = self.l_linear(sine_waves.cast(self.l_linear.weight.dtype)).tanh()
    noise = randn_like(uv) * self.sine_amp / 3
    return sine_merge, noise, uv

# most of the hifigan in standard vits is reused here, but need to upsample and construct harmonic source from f0
class Generator:
  def __init__(self, sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels):
    self.sampling_rate, self.inter_channels, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.gin_channels = sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels
    self.num_kernels, self.num_upsamples = len(resblock_kernel_sizes), len(upsample_rates)
    self.conv_pre = nn.Conv1d(inter_channels, upsample_initial_channel, 7, 1, padding=3)
    self.f0_upsamp = Upsample(scale_factor=np.prod(upsample_rates))
    self.m_source = SourceHnNSF(sampling_rate, harmonic_num=8)
    resblock = ResBlock1 if resblock == '1' else ResBlock2
    self.ups, self.noise_convs, self.resblocks = [], [], []
    for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
      c_cur = upsample_initial_channel//(2**(i+1))
      self.ups.append(nn.ConvTranspose1d(upsample_initial_channel//(2**i), c_cur, k, u, padding=(k-u)//2))
      stride_f0 = int(np.prod(upsample_rates[i + 1:]))
      self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2) if (i + 1 < len(upsample_rates)) else nn.Conv1d(1, c_cur, kernel_size=1))
    for i in range(len(self.ups)):
      ch = upsample_initial_channel // (2 ** (i + 1))
      for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
        self.resblocks.append(resblock(ch, k, d))
    self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3)
    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
    self.upp = np.prod(upsample_rates)
  def forward(self, x, f0, g=None):
    f0 = self.f0_upsamp.forward(f0[:, None]).transpose(1, 2) # bs,n,t
    har_source, _, _ = self.m_source.forward(f0, self.upp)
    har_source = har_source.transpose(1, 2)
    x = self.conv_pre(x)
    if g is not None: x = x + self.cond(g)
    for i in range(self.num_upsamples):
      x, xs = self.ups[i](x.leaky_relu(LRELU_SLOPE)), None
      x_source = self.noise_convs[i](har_source)
      x = x + x_source
      for j in range(self.num_kernels):
        if xs is None: xs = self.resblocks[i * self.num_kernels + j].forward(x)
        else: xs += self.resblocks[i * self.num_kernels + j].forward(x)
      x = xs / self.num_kernels
    return self.conv_post(x.leaky_relu()).tanh()

# **** helpers ****

def randn_like(x:Tensor) -> Tensor: return Tensor.randn(*x.shape, dtype=x.dtype).to(device=x.device)

def tilde(x: Tensor) -> Tensor:
  if x.dtype == dtypes.bool: return (1 - x).cast(dtypes.bool)
  return (x + 1) * -1 # this seems to be what the ~ operator does in pytorch for non bool

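Editor's note: tilde emulates torch's ~ operator, which this port leans on throughout. A quick sanity sketch (not part of the deleted file):

print(tilde(Tensor([True, False])).numpy())  # logical NOT for bools -> [False  True]
print(tilde(Tensor([0, 1, 2])).numpy())      # bitwise NOT (-x-1) otherwise -> [-1 -2 -3]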
def lengths_to_padding_mask(lens:Tensor) -> Tensor:
  bsz, max_lens = lens.shape[0], lens.max().numpy().item()
  mask = Tensor.arange(max_lens).to(lens.device).reshape(1, max_lens)
  mask = mask.expand(bsz, -1) >= lens.reshape(bsz, 1).expand(-1, max_lens)
  return mask.cast(dtypes.bool)

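Editor's note: a worked example of the mask convention, where positions at or past each sequence's length are flagged as padding (hypothetical lengths):

print(lengths_to_padding_mask(Tensor([2, 3])).numpy())
# -> [[False False  True]
#     [False False False]]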
def repeat_expand_2d_left(content, target_len): # content : [h, t]
  src_len = content.shape[-1]
  temp = np.arange(src_len+1) * target_len / src_len
  current_pos, cols = 0, []
  for i in range(target_len):
    if i >= temp[current_pos+1]:
      current_pos += 1
    cols.append(content[:, current_pos])
  return Tensor.stack(*cols).transpose(0, 1)

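Editor's note: repeat_expand_2d_left stretches a feature matrix along time by repeating the nearest frame to the left, which is how the speech encoding gets aligned to the f0 frames below. A small hedged example:

c = Tensor([[1.0, 2.0]])                    # [h=1, t=2]
print(repeat_expand_2d_left(c, 5).numpy())  # -> [[1. 1. 1. 2. 2.]]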
def load_fairseq_cfg(checkpoint_path):
  assert Path(checkpoint_path).is_file()
  state = torch_load(checkpoint_path)
  cfg = state["cfg"] if ("cfg" in state and state["cfg"] is not None) else None
  if cfg is None: raise RuntimeError(f"No cfg exist in state keys = {state.keys()}")
  return HParams(**cfg)

def load_checkpoint_enc(checkpoint_path, model: ContentVec, optimizer=None, skip_list=[]):
  assert Path(checkpoint_path).is_file()
  start_time = time.time()
  checkpoint_dict = torch_load(checkpoint_path)
  saved_state_dict = checkpoint_dict['model']
  weight_g, weight_v, parent = None, None, None
  for key, v in saved_state_dict.items():
    if any(layer in key for layer in skip_list): continue
    try:
      obj, skip = model, False
      for k in key.split('.'):
        if k.isnumeric(): obj = obj[int(k)]
        elif isinstance(obj, dict): obj = obj[k]
        else:
          if k in ["weight_g", "weight_v"]:
            parent, skip = obj, True
            if k == "weight_g": weight_g = v
            else: weight_v = v
          if not skip:
            parent = obj
            obj = getattr(obj, k)
      if weight_g and weight_v:
        setattr(obj, "weight_g", weight_g.numpy())
        setattr(obj, "weight_v", weight_v.numpy())
        obj, v = getattr(parent, "weight"), weight_norm(weight_v, weight_g, 0)
        weight_g, weight_v, parent, skip = None, None, None, False
      if not skip and obj.shape == v.shape:
        if "feature_extractor" in key and (isinstance(parent, (nn.GroupNorm, nn.LayerNorm))): # cast
          obj.assign(v.to(obj.device).float())
        else:
          obj.assign(v.to(obj.device))
      elif not skip: logging.error(f"MISMATCH SHAPE IN {key}, {obj.shape} {v.shape}")
    except Exception as e: raise e
  logging.info(f"Loaded checkpoint '{checkpoint_path}' in {time.time() - start_time:.4f}s")
  return model, optimizer

def pad_array(arr, target_length):
  current_length = arr.shape[0]
  if current_length >= target_length: return arr
  pad_width = target_length - current_length
  pad_left = pad_width // 2
  pad_right = pad_width - pad_left
  padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
  return padded_arr

def split_list_by_n(list_collection, n, pre=0):
  for i in range(0, len(list_collection), n):
    yield list_collection[i-pre if i-pre>=0 else i: i + n]

def get_sid(spk2id:HParams, speaker:str) -> Tensor:
  speaker_id = spk2id[speaker]
  if not speaker_id and type(speaker) is int:
    if len(spk2id.__dict__) >= speaker: speaker_id = speaker
  if speaker_id is None: raise RuntimeError(f"speaker={speaker} not in the speaker list")
  return Tensor([int(speaker_id)], dtype=dtypes.int64).unsqueeze(0)

def get_encoder(ssl_dim) -> Type[SpeechEncoder]:
  if ssl_dim == 256: return ContentVec256L9
  if ssl_dim == 768: return ContentVec768L12

#########################################################################################
# CODE: https://github.com/svc-develop-team/so-vits-svc
#########################################################################################
# CONTENTVEC:
# CODE: https://github.com/auspicious3000/contentvec
# PAPER: https://arxiv.org/abs/2204.09224
#########################################################################################
# INSTALLATION: dependencies are for preprocessing and loading/saving audio.
# pip3 install soundfile librosa praat-parselmouth
#########################################################################################
# EXAMPLE USAGE:
# python3 examples/so_vits_svc.py --model tf2spy --file ~/recording.wav
#########################################################################################
# DEMO USAGE (uses audio sample from LJ-Speech):
# python3 examples/so_vits_svc.py --model saul_goodman
#########################################################################################
SO_VITS_SVC_PATH = Path(__file__).parents[1] / "weights/So-VITS-SVC"
VITS_MODELS = { # config_path, weights_path, config_url, weights_url
  "saul_goodman" : (SO_VITS_SVC_PATH / "config_saul_gman.json", SO_VITS_SVC_PATH / "pretrained_saul_gman.pth", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/Saul_Goodman_80000/config.json", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/Saul_Goodman_80000/G_80000.pth"),
  "drake" : (SO_VITS_SVC_PATH / "config_drake.json", SO_VITS_SVC_PATH / "pretrained_drake.pth", "https://huggingface.co/jaspa/so-vits-svc/resolve/main/aubrey/config_aubrey.json", "https://huggingface.co/jaspa/so-vits-svc/resolve/main/aubrey/pretrained_aubrey.pth"),
  "cartman" : (SO_VITS_SVC_PATH / "config_cartman.json", SO_VITS_SVC_PATH / "pretrained_cartman.pth", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/EricCartman/config.json", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/EricCartman/G_10200.pth"),
  "tf2spy" : (SO_VITS_SVC_PATH / "config_tf2spy.json", SO_VITS_SVC_PATH / "pretrained_tf2spy.pth", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_spy_60k/config.json", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_spy_60k/G_60000.pth"),
  "tf2heavy" : (SO_VITS_SVC_PATH / "config_tf2heavy.json", SO_VITS_SVC_PATH / "pretrained_tf2heavy.pth", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_heavy_100k/config.json", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_heavy_100k/G_100000.pth"),
  "lady_gaga" : (SO_VITS_SVC_PATH / "config_gaga.json", SO_VITS_SVC_PATH / "pretrained_gaga.pth", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/LadyGaga/config.json", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/LadyGaga/G_14400.pth")
}
ENCODER_MODELS = { # weights_path, weights_url
  "contentvec": (SO_VITS_SVC_PATH / "contentvec_checkpoint.pt", "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
}
ENCODER_MODEL = "contentvec"
DEMO_PATH, DEMO_URL = Path(__file__).parents[1] / "temp/LJ037-0171.wav", "https://keithito.com/LJ-Speech-Dataset/LJ037-0171.wav"
if __name__ == "__main__":
  logging.basicConfig(stream=sys.stdout, level=(logging.INFO if DEBUG < 1 else logging.DEBUG))
  parser = argparse.ArgumentParser()
  parser.add_argument("-m", "--model", default=None, help=f"Specify the model to use. All supported models: {VITS_MODELS.keys()}", required=True)
  parser.add_argument("-f", "--file", default=DEMO_PATH, help="Specify the path of the input file")
  parser.add_argument("--out_dir", default=str(Path(__file__).parents[1] / "temp"), help="Specify the output path.")
  parser.add_argument("--out_path", default=None, help="Specify the full output path. Overrides the --out_dir and --name parameter.")
  parser.add_argument("--base_name", default="test", help="Specify the base of the output file name. Default is 'test'.")
  parser.add_argument("--speaker", default=None, help="If not specified, the first available speaker is chosen. Usually there is only one speaker per model.")
  parser.add_argument("--noise_scale", default=0.4)
  parser.add_argument("--tran", default=0.0, help="Pitch shift, supports positive and negative (semitone) values. Default 0.0")
  parser.add_argument("--pad_seconds", default=0.5)
  parser.add_argument("--lg_num", default=0.0)
  parser.add_argument("--clip_seconds", default=0.0)
  parser.add_argument("--slice_db", default=-40)
  args = parser.parse_args()

  vits_model = args.model
  encoder_location, vits_location = ENCODER_MODELS[ENCODER_MODEL], VITS_MODELS[vits_model]

  Tensor.training = False
  # Get Synthesizer and ContentVec
  net_g, hps = Synthesizer.load_from_pretrained(vits_location[0], vits_location[2], vits_location[1], vits_location[3])
  Encoder = get_encoder(hps.model.ssl_dim)
  encoder = Encoder.load_from_pretrained(encoder_location[0], encoder_location[1])

  # model config args
  target_sample, spk2id, hop_length = hps.data.sampling_rate, hps.spk, hps.data.hop_length
  vol_embedding = hps.model.vol_embedding if hasattr(hps.model, "vol_embedding") and hps.model.vol_embedding is not None else False

  # args
  slice_db, clip_seconds, lg_num, pad_seconds, tran, noise_scale, audio_path = args.slice_db, args.clip_seconds, args.lg_num, args.pad_seconds, args.tran, args.noise_scale, args.file
  speaker = args.speaker if args.speaker is not None else list(hps.spk.__dict__.keys())[0]

  ### Loading audio and slicing ###
  if audio_path == DEMO_PATH: fetch(DEMO_URL, DEMO_PATH)
  assert Path(audio_path).is_file() and Path(audio_path).suffix == ".wav"
  chunks = preprocess.cut(audio_path, db_thresh=slice_db)
  audio_data, audio_sr = preprocess.chunks2audio(audio_path, chunks)

  per_size = int(clip_seconds * audio_sr)
  lg_size = int(lg_num * audio_sr)

  ### Infer per slice ###
  global_frame = 0
  audio = []
  for (slice_tag, data) in audio_data:
    print(f"\n====segment start, {round(len(data) / audio_sr, 3)}s====")
    length = int(np.ceil(len(data) / audio_sr * target_sample))

    if slice_tag:
      print("empty segment")
      _audio = np.zeros(length)
      audio.extend(list(pad_array(_audio, length)))
      global_frame += length // hop_length
      continue

    datas = [data] if per_size == 0 else split_list_by_n(data, per_size, lg_size)

    for k, dat in enumerate(datas):
      per_length = int(np.ceil(len(dat) / audio_sr * target_sample)) if clip_seconds!=0 else length
      pad_len = int(audio_sr * pad_seconds)
      dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
      raw_path = io.BytesIO()
      soundfile.write(raw_path, dat, audio_sr, format="wav")
      raw_path.seek(0)

      ### Infer START ###
      wav, sr = preprocess.load_audiofile(raw_path)
      wav = preprocess.sinc_interp_resample(wav, sr, target_sample)[0]
      wav16k, f0, uv = preprocess.get_unit_f0(wav, tran, hop_length, target_sample)
      sid = get_sid(spk2id, speaker)
      n_frames = f0.shape[1]

      # ContentVec infer
      start = time.time()
      c = encoder.encode(wav16k)
      c = repeat_expand_2d_left(c.squeeze(0).realize(), f0.shape[1]) # interpolate speech encoding to match f0
      c = c.unsqueeze(0).realize()
      enc_time = time.time() - start

      # VITS infer
      vits_start = time.time()
      out_audio, f0 = net_g.infer(c, f0=f0, uv=uv, g=sid, noise_scale=noise_scale, vol=None)
      out_audio = out_audio[0,0].float().realize()
      vits_time = time.time() - vits_start

      infer_time = time.time() - start
      logging.info("total infer time:{:.2f}s, speech_enc time:{:.2f}s, vits time:{:.2f}s".format(infer_time, enc_time, vits_time))
      ### Infer END ###

      out_sr, out_frame = out_audio.shape[-1], n_frames
      global_frame += out_frame
      _audio = out_audio.numpy()
      pad_len = int(target_sample * pad_seconds)
      _audio = _audio[pad_len:-pad_len]
      _audio = pad_array(_audio, per_length)
      audio.extend(list(_audio))

  audio = np.array(audio)
  out_path = Path(args.out_path or Path(args.out_dir)/f"{args.model}_spk_{speaker}_{args.base_name}.wav")
  out_path.parent.mkdir(parents=True, exist_ok=True)
  soundfile.write(out_path, audio, target_sample, format="flac")
  logging.info(f"Saved audio output to {out_path}")
@@ -1,204 +0,0 @@
import math
from typing import Optional, Tuple
from tinygrad import Tensor, dtypes
import librosa
import soundfile
import numpy as np
import parselmouth

class PMF0Predictor: # from https://github.com/svc-develop-team/so-vits-svc/
  def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
    self.hop_length, self.f0_min, self.f0_max, self.sampling_rate, self.name = hop_length, f0_min, f0_max, sampling_rate, "pm"
  def interpolate_f0(self, f0):
    vuv_vector = np.zeros_like(f0, dtype=np.float32)
    vuv_vector[f0 > 0.0] = 1.0
    vuv_vector[f0 <= 0.0] = 0.0
    nzindex = np.nonzero(f0)[0]
    data = f0[nzindex]
    nzindex = nzindex.astype(np.float32)
    time_org = self.hop_length / self.sampling_rate * nzindex
    time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate
    if data.shape[0] <= 0: return np.zeros(f0.shape[0], dtype=np.float32), vuv_vector
    if data.shape[0] == 1: return np.ones(f0.shape[0], dtype=np.float32) * f0[0], vuv_vector
    f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1])
    return f0, vuv_vector
  def compute_f0(self, wav, p_len=None):
    x = wav
    if p_len is None: p_len = x.shape[0]//self.hop_length
    else: assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
    time_step = self.hop_length / self.sampling_rate * 1000
    f0 = parselmouth.Sound(x, self.sampling_rate) \
      .to_pitch_ac(time_step=time_step / 1000, voicing_threshold=0.6, pitch_floor=self.f0_min, pitch_ceiling=self.f0_max) \
      .selected_array['frequency']
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
      f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
    f0, uv = self.interpolate_f0(f0)
    return f0
  def compute_f0_uv(self, wav, p_len=None):
    x = wav
    if p_len is None: p_len = x.shape[0]//self.hop_length
    else: assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
    time_step = self.hop_length / self.sampling_rate * 1000
    f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac(
      time_step=time_step / 1000, voicing_threshold=0.6,
      pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
      f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
    f0, uv = self.interpolate_f0(f0)
    return f0, uv

class Slicer: # from https://github.com/svc-develop-team/so-vits-svc/
  def __init__(self, sr: int, threshold: float = -40., min_length: int = 5000, min_interval: int = 300, hop_size: int = 20, max_sil_kept: int = 5000):
    if not min_length >= min_interval >= hop_size:
      raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
    if not max_sil_kept >= hop_size:
      raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
    min_interval = sr * min_interval / 1000
    self.threshold = 10 ** (threshold / 20.)
    self.hop_size = round(sr * hop_size / 1000)
    self.win_size = min(round(min_interval), 4 * self.hop_size)
    self.min_length = round(sr * min_length / 1000 / self.hop_size)
    self.min_interval = round(min_interval / self.hop_size)
    self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
  def _apply_slice(self, waveform, begin, end):
    if len(waveform.shape) > 1: return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
    else: return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
  def slice(self, waveform):
    samples = librosa.to_mono(waveform) if len(waveform.shape) > 1 else waveform
    if samples.shape[0] <= self.min_length: return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
    rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
    sil_tags, silence_start, clip_start = [], None, 0
    for i, rms in enumerate(rms_list):
      if rms < self.threshold: # Keep looping while frame is silent.
        if silence_start is None: # Record start of silent frames.
          silence_start = i
        continue
      if silence_start is None: continue # Keep looping while frame is not silent and silence start has not been recorded.
      # Clear recorded silence start if interval is not enough or clip is too short
      is_leading_silence = silence_start == 0 and i > self.max_sil_kept
      need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
      if not is_leading_silence and not need_slice_middle:
        silence_start = None
        continue
      if i - silence_start <= self.max_sil_kept: # Need slicing. Record the range of silent frames to be removed.
        pos = rms_list[silence_start: i + 1].argmin() + silence_start
        sil_tags.append((0, pos) if silence_start == 0 else (pos, pos))
        clip_start = pos
      elif i - silence_start <= self.max_sil_kept * 2:
        pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
        pos += i - self.max_sil_kept
        pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
        pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
        if silence_start == 0:
          sil_tags.append((0, pos_r))
          clip_start = pos_r
        else:
          sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
          clip_start = max(pos_r, pos)
      else:
        pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
        pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
        sil_tags.append((0, pos_r) if silence_start == 0 else (pos_l, pos_r))
        clip_start = pos_r
      silence_start = None
    total_frames = rms_list.shape[0]
    if silence_start is not None and total_frames - silence_start >= self.min_interval: # Deal with trailing silence.
      silence_end = min(total_frames, silence_start + self.max_sil_kept)
      pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
      sil_tags.append((pos, total_frames + 1))
    if len(sil_tags) == 0: return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}} # Apply and return slices.
    chunks = []
    if sil_tags[0][0]:
      chunks.append({"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
    for i in range(0, len(sil_tags)):
      if i: chunks.append({"slice": False, "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
      chunks.append({"slice": True, "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
    if sil_tags[-1][1] * self.hop_size < len(waveform):
      chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
    chunk_dict = {}
    for i in range(len(chunks)): chunk_dict[str(i)] = chunks[i]
    return chunk_dict

# sinc_interp_hann audio resampling
class Resample:
  def __init__(self, orig_freq:int=16000, new_freq:int=16000, lowpass_filter_width:int=6, rolloff:float=0.99, beta:Optional[float]=None, dtype:Optional[dtypes]=None):
    self.orig_freq, self.new_freq, self.lowpass_filter_width, self.rolloff, self.beta = orig_freq, new_freq, lowpass_filter_width, rolloff, beta
    self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq))
    self.kernel, self.width = self._get_sinc_resample_kernel(dtype) if self.orig_freq != self.new_freq else (None, None)
  def __call__(self, waveform:Tensor) -> Tensor:
    if self.orig_freq == self.new_freq: return waveform
    return self._apply_sinc_resample_kernel(waveform)
  def _apply_sinc_resample_kernel(self, waveform:Tensor):
    if not waveform.is_floating_point(): raise TypeError(f"Waveform tensor expected to be of type float, but received {waveform.dtype}.")
    orig_freq, new_freq = (int(self.orig_freq) // self.gcd), (int(self.new_freq) // self.gcd)
    shape = waveform.shape
    waveform = waveform.reshape(-1, shape[-1]) # pack batch
    num_wavs, length = waveform.shape
    target_length = int(math.ceil(new_freq * length / orig_freq))
    waveform = waveform.pad((self.width, self.width + orig_freq))
    resampled = waveform[:, None].conv2d(self.kernel, stride=orig_freq)
    resampled = resampled.transpose(1, 2).reshape(num_wavs, -1)
    resampled = resampled[..., :target_length]
    resampled = resampled.reshape(shape[:-1] + resampled.shape[-1:]) # unpack batch
    return resampled
  def _get_sinc_resample_kernel(self, dtype=None):
    orig_freq, new_freq = (int(self.orig_freq) // self.gcd), (int(self.new_freq) // self.gcd)
    if self.lowpass_filter_width <= 0: raise ValueError("Low pass filter width should be positive.")
    base_freq = min(orig_freq, new_freq)
    base_freq *= self.rolloff
    width = math.ceil(self.lowpass_filter_width * orig_freq / base_freq)
    idx = Tensor.arange(-width, width + orig_freq, dtype=(dtype if dtype is not None else dtypes.float32))[None, None] / orig_freq
    t = Tensor.arange(0, -new_freq, -1, dtype=dtype)[:, None, None] / new_freq + idx
    t *= base_freq
    t = t.clip(-self.lowpass_filter_width, self.lowpass_filter_width)
    window = (t * math.pi / self.lowpass_filter_width / 2).cos() ** 2
    t *= math.pi
    scale = base_freq / orig_freq
    kernels = Tensor.where(t == 0, Tensor(1.0, dtype=t.dtype).to(t.device), t.sin() / t)
    kernels *= window * scale
    if dtype is None: kernels = kernels.cast(dtype=dtypes.float32)
    return kernels, width

def sinc_interp_resample(x:Tensor, orig_freq:int=16000, new_freq:int=1600, lowpass_filter_width:int=6, rolloff:float=0.99, beta:Optional[float]=None):
  resamp = Resample(orig_freq, new_freq, lowpass_filter_width, rolloff, beta, x.dtype)
  return resamp(x)
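Editor's note: a minimal usage sketch of the resampler, downsampling one second of hypothetical 44.1kHz audio to 16kHz:

wav = Tensor.randn(1, 44100)                      # hypothetical mono batch
wav16k = sinc_interp_resample(wav, 44100, 16000)  # -> shape (1, 16000)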

def cut(audio_path, db_thresh=-30, min_len=5000):
  audio, sr = librosa.load(audio_path, sr=None)
  slicer = Slicer(sr=sr, threshold=db_thresh, min_length=min_len)
  chunks = slicer.slice(audio)
  return chunks

def chunks2audio(audio_path, chunks):
  chunks = dict(chunks)
  audio, sr = load_audiofile(audio_path)
  if len(audio.shape) == 2 and audio.shape[1] >= 2:
    audio = audio.mean(0).unsqueeze(0)
  audio = audio.numpy()[0]
  result = []
  for k, v in chunks.items():
    tag = v["split_time"].split(",")
    if tag[0] != tag[1]:
      result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
  return result, sr

def load_audiofile(filepath:str, frame_offset:int=0, num_frames:int=-1, channels_first:bool=True):
  with soundfile.SoundFile(filepath, "r") as file_:
    frames = file_._prepare_read(frame_offset, None, num_frames)
    waveform = file_.read(frames, "float32", always_2d=True)
    sample_rate = file_.samplerate
  waveform = Tensor(waveform)
  if channels_first: waveform = waveform.transpose(0, 1)
  return waveform, sample_rate

def get_unit_f0(wav:Tensor, tran, hop_length, target_sample, f0_filter=False) -> Tuple[Tensor,Tensor,Tensor]:
  f0_predictor = PMF0Predictor(hop_length, sampling_rate=target_sample)
  f0, uv = f0_predictor.compute_f0_uv(wav.numpy())
  if f0_filter and sum(f0) == 0: raise RuntimeError("No voice detected")
  f0 = Tensor(f0.astype(np.float32)).float()
  f0 = (f0 * 2 ** (tran / 12)).unsqueeze(0)
  uv = Tensor(uv.astype(np.float32)).float().unsqueeze(0)
  wav16k = sinc_interp_resample(wav[None,:], target_sample, 16000)[0]
  return wav16k.realize(), f0.realize(), uv.realize()
34 examples/tools/bandwidth_test.py Normal file
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
from tinygrad import Tensor, Device, GlobalCounters, Context, dtypes
from tinygrad.helpers import getenv, colored

SZ = 8_000_000_000
GPUS = getenv("GPUS", 4)  # TODO: expose a way in tinygrad to access this

if __name__ == "__main__":
  # create tensors
  tens = [Tensor.ones(SZ, dtype=dtypes.uint8, device=f"{Device.DEFAULT}:{i}").contiguous() for i in range(GPUS)]
  Tensor.realize(*tens)

  bw = [[0.0]*GPUS for _ in range(GPUS)]
  for i in range(GPUS):
    for j in range(GPUS):
      GlobalCounters.reset()
      with Context(DEBUG=2):
        if i == j:
          # this copy would be optimized out, just add 1
          (tens[i]+1).realize()
        else:
          tens[i].to(f"{Device.DEFAULT}:{j}").realize()
      t = max(GlobalCounters.time_sum_s, 1e-9)
      bw[i][j] = SZ / t / 1e9  # GB/s

  def fmt(x):
    c = "green" if x > 50 else "yellow" if x > 20 else "red"
    return colored(f"{x:6.1f}", c)

  # header
  print(" " * 8 + " ".join(f"{'d'+str(j):>6}" for j in range(GPUS)))
  # rows
  for i in range(GPUS):
    print(f"{'s'+str(i):>6} -> " + " ".join(fmt(x) for x in bw[i]))
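
Each cell of the table is bytes moved over measured kernel time; illustratively, an 8 GB copy finishing in 0.25 s reports 8e9 / 0.25 / 1e9 = 32.0 GB/s, which fmt prints in yellow. A hypothetical invocation:

# hypothetical invocation: pairwise copy bandwidth across two devices
# GPUS=2 python3 examples/tools/bandwidth_test.py
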
16 examples/tools/gpuburn.py Normal file
@@ -0,0 +1,16 @@
from tinygrad import Tensor, Device, TinyJit, dtypes
from tinygrad.helpers import getenv

GPUS = getenv("GPUS", 4)  # TODO: expose a way in tinygrad to access this
N = 6144

@TinyJit
def many_matmul(A, B):
  out = A
  for _ in range(8): out = out@B
  return out

if __name__ == "__main__":
  A = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
  B = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
  while 1: many_matmul(A, B)
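
Each jitted call chains 8 N x N half-precision matmuls on every shard, so the per-device work per call is about 8 * 2 * N**3 FLOPs; with N = 6144 that is roughly 3.7 TFLOPs, which is what makes the endless loop a burn-in:

# illustrative: per-device FLOPs in one many_matmul call
N = 6144
print(8 * 2 * N**3 / 1e12)   # ~3.71 (TFLOPs)
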
@@ -1,104 +0,0 @@
import traceback
import time
from multiprocessing import Process, Queue
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import getenv, trange
from tinygrad.tensor import Tensor
from extra.datasets import fetch_cifar
from extra.models.efficientnet import EfficientNet

class TinyConvNet:
  def __init__(self, classes=10):
    conv = 3
    inter_chan, out_chan = 8, 16  # for speed
    self.c1 = Tensor.uniform(inter_chan,3,conv,conv)
    self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
    self.l1 = Tensor.uniform(out_chan*6*6, classes)

  def forward(self, x):
    x = x.conv2d(self.c1).relu().max_pool2d()
    x = x.conv2d(self.c2).relu().max_pool2d()
    x = x.reshape(shape=[x.shape[0], -1])
    return x.dot(self.l1)
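
The out_chan*6*6 fan-in of l1 follows from the unpadded 3x3 convs and 2x2 pools on 32x32 CIFAR images (32 -> 30 -> 15, then 15 -> 13 -> 6); a quick shape check, with an illustrative batch of 4:

# illustrative: trace a CIFAR-shaped batch through TinyConvNet
x = Tensor.uniform(4, 3, 32, 32)
print(TinyConvNet().forward(x).shape)   # (4, 10)
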
if __name__ == "__main__":
  IMAGENET = getenv("IMAGENET")
  classes = 1000 if IMAGENET else 10

  TINY = getenv("TINY")
  TRANSFER = getenv("TRANSFER")
  if TINY:
    model = TinyConvNet(classes)
  elif TRANSFER:
    model = EfficientNet(getenv("NUM", 0), classes, has_se=True)
    model.load_from_pretrained()
  else:
    model = EfficientNet(getenv("NUM", 0), classes, has_se=False)

  parameters = get_parameters(model)
  print("parameter count", len(parameters))
  optimizer = optim.Adam(parameters, lr=0.001)

  BS, steps = getenv("BS", 64 if TINY else 16), getenv("STEPS", 2048)
  print(f"training with batch size {BS} for {steps} steps")
  if IMAGENET:
    from extra.datasets.imagenet import fetch_batch
    # two daemon processes keep a queue of prefetched batches ahead of training
    def loader(q):
      while 1:
        try:
          q.put(fetch_batch(BS))
        except Exception:
          traceback.print_exc()
    q = Queue(16)
    for i in range(2):
      p = Process(target=loader, args=(q,))
      p.daemon = True
      p.start()
  else:
    X_train, Y_train, _, _ = fetch_cifar()
    X_train = X_train.reshape((-1, 3, 32, 32))
    Y_train = Y_train.reshape((-1,))
  with Tensor.train():
    for i in (t := trange(steps)):
      if IMAGENET:
        X, Y = q.get(True)
      else:
        samp = np.random.randint(0, X_train.shape[0], size=(BS))
        X, Y = X_train.numpy()[samp], Y_train.numpy()[samp]

      st = time.time()
      out = model.forward(Tensor(X.astype(np.float32), requires_grad=False))
      fp_time = (time.time()-st)*1000.0

      # one-hot targets scaled by -classes: the mean over BS*classes elements then
      # reduces to standard cross-entropy, since the factor cancels the class dim
      y = np.zeros((BS,classes), np.float32)
      y[range(y.shape[0]),Y] = -classes
      y = Tensor(y, requires_grad=False)
      loss = out.log_softmax().mul(y).mean()

      optimizer.zero_grad()

      st = time.time()
      loss.backward()
      bp_time = (time.time()-st)*1000.0

      st = time.time()
      optimizer.step()
      opt_time = (time.time()-st)*1000.0

      st = time.time()
      loss = loss.numpy()
      cat = out.argmax(axis=1).numpy()
      accuracy = (cat == Y).mean()
      finish_time = (time.time()-st)*1000.0

      # printing
      t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f" %
        (loss, accuracy,
         fp_time, bp_time, opt_time, finish_time,
         fp_time + bp_time + opt_time + finish_time))

      del out, y, loss
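
The scaled one-hot targets above reduce to ordinary cross-entropy because the mean over BS*classes elements is cancelled by the -classes factor; a numpy-only check with illustrative numbers:

# illustrative: mean of (one_hot * -classes * log_softmax) equals NLL over the batch
import numpy as np
BS, classes = 4, 10
logp = np.log(np.full((BS, classes), 1.0/classes))   # uniform log-probs
tgt = np.array([0, 1, 2, 3])
y = np.zeros((BS, classes), np.float32)
y[range(BS), tgt] = -classes
print((logp * y).mean(), -logp[range(BS), tgt].mean())   # both ~2.3026
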
@@ -1,46 +0,0 @@
import ast
import numpy as np
from PIL import Image
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, fetch
from extra.models.vit import ViT
"""
fn = "gs://vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz"
import tensorflow as tf
with tf.io.gfile.GFile(fn, "rb") as f:
  dat = f.read()
with open("cache/"+ fn.rsplit("/", 1)[1], "wb") as g:
  g.write(dat)
"""

Tensor.training = False
if getenv("LARGE", 0) == 1:
  m = ViT(embed_dim=768, num_heads=12)
else:
  # tiny
  m = ViT(embed_dim=192, num_heads=3)
m.load_from_pretrained()
# category labels
lbls = ast.literal_eval(fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt").read_text())

#url = "https://upload.wikimedia.org/wikipedia/commons/4/41/Chicken.jpg"
url = "https://repository-images.githubusercontent.com/296744635/39ba6700-082d-11eb-98b8-cb29fb7369c0"

# preprocessing: resize so the short side is 224, center-crop, HWC -> CHW, scale to [-1, 1]
img = Image.open(fetch(url))
aspect_ratio = img.size[0] / img.size[1]
img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
img = np.array(img)
y0,x0=(np.asarray(img.shape)[:2]-224)//2
img = img[y0:y0+224, x0:x0+224]
img = np.moveaxis(img, [2,0,1], [0,1,2])
img = img.astype(np.float32)[:3].reshape(1,3,224,224)
img /= 255.0
img -= 0.5
img /= 0.5
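
The three in-place ops are one affine map, (x - 127.5) / 127.5, taking uint8 pixels onto [-1, 1]:

# illustrative: the normalization maps 0..255 onto -1..1
px = np.array([0.0, 127.5, 255.0])
print((px/255.0 - 0.5) / 0.5)   # [-1.  0.  1.]
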

out = m.forward(Tensor(img))
outnp = out.numpy().ravel()
choice = outnp.argmax()
print(out.shape, choice, outnp[choice], lbls[choice])
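
A hypothetical invocation, following the env-var convention of the other examples (the script path is an assumption):

# hypothetical usage: run with the base-size ViT instead of the tiny default
# LARGE=1 python3 examples/vit.py
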