cleanup stale examples/extra (#13764)

* cleanup stale files

* examples

* move those back

* old

* delete more
Author: George Hotz
Date: 2025-12-19 16:27:37 -04:00
Committed by: GitHub
Parent: 80b84f5267
Commit: df6cde8a00
45 changed files with 0 additions and 5039 deletions


@@ -1,93 +0,0 @@
#!/usr/bin/env python3
import os, sys, traceback
sys.path.append(os.getcwd())
from io import StringIO
from contextlib import redirect_stdout
from tinygrad import Tensor, nn
from tinygrad.helpers import Timing, colored, getenv, fetch
from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
from sentencepiece import SentencePieceProcessor
def create_fixed_tokenizer(output_file):
print("creating fixed tokenizer")
import extra.junk.sentencepiece_model_pb2 as spb2
mp = spb2.ModelProto()
mp.ParseFromString(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/tokenizer.model?download=true").read_bytes())
mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_end|>", score=0))
mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_start|>", score=0))
with open(output_file, "wb") as f:
f.write(mp.SerializeToString())
# example:
# echo -en "write 2+2\nwrite hello world\ny\n" | TEMP=0 python3 examples/coder.py
if __name__ == "__main__":
# https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json
with Timing("create model: "):
model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096, jit=getenv("JIT", 1))
with Timing("download weights: "):
part1 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00001-of-00002.bin?download=true"))
part2 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00002-of-00002.bin?download=true"))
with Timing("weights -> model: "):
nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part1, 32, 32, 8)), strict=False)
nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part2, 32, 32, 8)), strict=False)
if not os.path.isfile("/tmp/tokenizer.model"): create_fixed_tokenizer("/tmp/tokenizer.model")
spp = SentencePieceProcessor(model_file="/tmp/tokenizer.model")
# https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/tokenizer_config.json
# "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
IM_END = 32000
IM_START = 32001
def encode_prompt(k, v): return [IM_START]+spp.encode(f"{k}\n{v}")+[IM_END]+spp.encode("\n")
def start_prompt(k): return [IM_START]+spp.encode(f"{k}\n")
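# e.g. encode_prompt("user", "hi") tokenizes the ChatML string "<|im_start|>user\nhi<|im_end|>\n" from the chat_template above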
def output(outputted, toks, color):
cur = spp.decode(toks)[len(outputted):]
sys.stdout.write(colored(cur, color))
sys.stdout.flush()
outputted += cur
return outputted
# *** app below this line ***
toks = [spp.bos_id()] + encode_prompt("system", "You are Quentin. Quentin is a useful assistant who writes Python code to answer questions. He keeps the code as short as possible and doesn't read from user input")
PROMPT = getenv("PROMPT", 1)
temperature = getenv("TEMP", 0.7)
start_pos = 0
outputted = output("", toks, "green")
turn = True
while 1:
if PROMPT:
toks += encode_prompt("user", input("Q: ")) + start_prompt("assistant")
else:
toks += start_prompt("user" if turn else "assistant")
turn = not turn
old_output_len = len(outputted)
while 1:
tok = model(Tensor([toks[start_pos:]]), start_pos, temperature).item()
start_pos = len(toks)
toks.append(tok)
outputted = output(outputted, toks, "blue" if not turn else "cyan")
if tok == IM_END: break
if tok == spp.eos_id(): break
new_output = outputted[old_output_len:]
if new_output.endswith("```") and '```python\n' in new_output:
python_code = new_output.split('```python\n')[1].split("```")[0]
  # AI safety: warn the user. Do not press y if the AI is trying to do something unsafe.
if input(colored(f" <-- PYTHON DETECTED, RUN IT? ", "red")).lower() == 'y':
my_stdout = StringIO()
try:
with redirect_stdout(my_stdout): exec(python_code)
result = my_stdout.getvalue()
except Exception as e:
result = ''.join(traceback.format_exception_only(e))
toks += spp.encode(f"\nOutput:\n```\n{result}```")
outputted = output(outputted, toks, "yellow")
old_output_len = len(outputted)
print("")


@@ -1,89 +0,0 @@
# load weights from
# https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth
# a rough copy of
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
import sys
import ast
import time
import numpy as np
from PIL import Image
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, fetch, Timing
from tinygrad.engine.jit import TinyJit
from extra.models.efficientnet import EfficientNet
np.set_printoptions(suppress=True)
# TODO: you should be able to put these in the jitted function
bias = Tensor([0.485, 0.456, 0.406])
scale = Tensor([0.229, 0.224, 0.225])
@TinyJit
def _infer(model, img):
img = img.permute((2,0,1))
img = img / 255.0
img = img - bias.reshape((1,-1,1,1))
img = img / scale.reshape((1,-1,1,1))
return model.forward(img).realize()
def infer(model, img):
# preprocess image
aspect_ratio = img.size[0] / img.size[1]
img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
img = np.array(img)
y0,x0=(np.asarray(img.shape)[:2]-224)//2
retimg = img = img[y0:y0+224, x0:x0+224]
# if you want to look at the image
"""
import matplotlib.pyplot as plt
plt.imshow(img)
plt.show()
"""
# run the net
out = _infer(model, Tensor(img.astype("float32"))).numpy()
# if you want to look at the outputs
"""
import matplotlib.pyplot as plt
plt.plot(out[0])
plt.show()
"""
return out, retimg
if __name__ == "__main__":
# instantiate my net
model = EfficientNet(getenv("NUM", 0))
model.load_from_pretrained()
# category labels
lbls = ast.literal_eval(fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt").read_text())
# load image and preprocess
url = sys.argv[1] if len(sys.argv) >= 2 else "https://raw.githubusercontent.com/tinygrad/tinygrad/master/docs/showcase/stable_diffusion_by_tinygrad.jpg"
if url == 'webcam':
import cv2
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
while 1:
_ = cap.grab() # discard one frame to circumvent capture buffering
ret, frame = cap.read()
img = Image.fromarray(frame[:, :, [2,1,0]])
lt = time.monotonic_ns()
out, retimg = infer(model, img)
print(f"{(time.monotonic_ns()-lt)*1e-6:7.2f} ms", np.argmax(out), np.max(out), lbls[np.argmax(out)])
SCALE = 3
simg = cv2.resize(retimg, (224*SCALE, 224*SCALE))
retimg = cv2.cvtColor(simg, cv2.COLOR_RGB2BGR)
cv2.imshow('capture', retimg)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
cap.release()
cv2.destroyAllWindows()
else:
img = Image.open(fetch(url))
for i in range(getenv("CNT", 1)):
with Timing("did inference in "):
out, _ = infer(model, img)
print(np.argmax(out), np.max(out), lbls[np.argmax(out)])


@@ -1,498 +0,0 @@
# pip3 install sentencepiece
# This file incorporates code from the following:
# Github Name | License | Link
# black-forest-labs/flux | Apache | https://github.com/black-forest-labs/flux/tree/main/model_licenses
from tinygrad import Tensor, nn, dtypes, TinyJit
from tinygrad.nn.state import safe_load, load_state_dict
from tinygrad.helpers import fetch, tqdm, colored
from sdxl import FirstStage
from extra.models.clip import FrozenClosedClipEmbedder
from extra.models.t5 import T5Embedder
import numpy as np
import math, time, argparse, tempfile
from typing import List, Dict, Optional, Union, Tuple, Callable
from dataclasses import dataclass
from pathlib import Path
from PIL import Image
urls:dict = {
"flux-schnell": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/flux1-schnell.safetensors",
"flux-dev": "https://huggingface.co/camenduru/FLUX.1-dev/resolve/main/flux1-dev.sft",
"ae": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/ae.safetensors",
"T5_1_of_2": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder_2/model-00001-of-00002.safetensors",
"T5_2_of_2": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder_2/model-00002-of-00002.safetensors",
"T5_tokenizer": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/tokenizer_2/spiece.model",
"clip": "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/text_encoder/model.safetensors"
}
def tensor_identity(x:Tensor) -> Tensor: return x
class AutoEncoder:
def __init__(self, scale_factor:float, shift_factor:float):
self.decoder = FirstStage.Decoder(128, 3, 3, 16, [1, 2, 4, 4], 2, 256)
self.scale_factor = scale_factor
self.shift_factor = shift_factor
def decode(self, z:Tensor) -> Tensor:
z = z / self.scale_factor + self.shift_factor
return self.decoder(z)
# Conditioner
class ClipEmbedder(FrozenClosedClipEmbedder):
def __call__(self, texts:Union[str, List[str], Tensor]) -> Tensor:
if isinstance(texts, str): texts = [texts]
assert isinstance(texts, (list,tuple)), f"expected list of strings, got {type(texts).__name__}"
tokens = Tensor.cat(*[Tensor(self.tokenizer.encode(text)) for text in texts], dim=0)
return self.transformer.text_model(tokens.reshape(len(texts),-1))[:, tokens.argmax(-1)]
# https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
def attention(q:Tensor, k:Tensor, v:Tensor, pe:Tensor) -> Tensor:
q, k = apply_rope(q, k, pe)
x = Tensor.scaled_dot_product_attention(q, k, v)
return x.rearrange("B H L D -> B L (H D)")
def rope(pos:Tensor, dim:int, theta:int) -> Tensor:
assert dim % 2 == 0
scale = Tensor.arange(0, dim, 2, dtype=dtypes.float32, device=pos.device) / dim # NOTE: this is torch.float64 in reference implementation
omega = 1.0 / (theta**scale)
out = Tensor.einsum("...n,d->...nd", pos, omega)
out = Tensor.stack(Tensor.cos(out), -Tensor.sin(out), Tensor.sin(out), Tensor.cos(out), dim=-1)
out = out.rearrange("b n d (i j) -> b n d i j", i=2, j=2)
return out.float()
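# the result has shape (..., n, dim//2, 2, 2): one 2x2 rotation matrix [[cos, -sin], [sin, cos]] per position and frequency,
# which apply_rope below applies to consecutive (even, odd) pairs of q and k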
def apply_rope(xq:Tensor, xk:Tensor, freqs_cis:Tensor) -> Tuple[Tensor, Tensor]:
xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
return xq_out.reshape(*xq.shape).cast(xq.dtype), xk_out.reshape(*xk.shape).cast(xk.dtype)
# https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
class EmbedND:
def __init__(self, dim:int, theta:int, axes_dim:List[int]):
self.dim = dim
self.theta = theta
self.axes_dim = axes_dim
def __call__(self, ids:Tensor) -> Tensor:
n_axes = ids.shape[-1]
emb = Tensor.cat(*[rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], dim=-3)
return emb.unsqueeze(1)
class MLPEmbedder:
def __init__(self, in_dim:int, hidden_dim:int):
self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
def __call__(self, x:Tensor) -> Tensor:
return self.out_layer(self.in_layer(x).silu())
class QKNorm:
def __init__(self, dim:int):
self.query_norm = nn.RMSNorm(dim)
self.key_norm = nn.RMSNorm(dim)
def __call__(self, q:Tensor, k:Tensor) -> Tuple[Tensor, Tensor]:
return self.query_norm(q), self.key_norm(k)
class SelfAttention:
def __init__(self, dim:int, num_heads:int = 8, qkv_bias:bool = False):
self.num_heads = num_heads
head_dim = dim // num_heads
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.norm = QKNorm(head_dim)
self.proj = nn.Linear(dim, dim)
def __call__(self, x:Tensor, pe:Tensor) -> Tensor:
qkv = self.qkv(x)
q, k, v = qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
q, k = self.norm(q, k)
x = attention(q, k, v, pe=pe)
return self.proj(x)
@dataclass
class ModulationOut:
shift:Tensor
scale:Tensor
gate:Tensor
class Modulation:
def __init__(self, dim:int, double:bool):
self.is_double = double
self.multiplier = 6 if double else 3
self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
def __call__(self, vec:Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]:
out = self.lin(vec.silu())[:, None, :].chunk(self.multiplier, dim=-1)
return ModulationOut(*out[:3]), ModulationOut(*out[3:]) if self.is_double else None
class DoubleStreamBlock:
def __init__(self, hidden_size:int, num_heads:int, mlp_ratio:float, qkv_bias:bool = False):
mlp_hidden_dim = int(hidden_size * mlp_ratio)
self.num_heads = num_heads
self.hidden_size = hidden_size
self.img_mod = Modulation(hidden_size, double=True)
self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.img_mlp = [nn.Linear(hidden_size, mlp_hidden_dim, bias=True), Tensor.gelu, nn.Linear(mlp_hidden_dim, hidden_size, bias=True)]
self.txt_mod = Modulation(hidden_size, double=True)
self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.txt_mlp = [nn.Linear(hidden_size, mlp_hidden_dim, bias=True), Tensor.gelu, nn.Linear(mlp_hidden_dim, hidden_size, bias=True)]
def __call__(self, img:Tensor, txt:Tensor, vec:Tensor, pe:Tensor) -> tuple[Tensor, Tensor]:
img_mod1, img_mod2 = self.img_mod(vec)
txt_mod1, txt_mod2 = self.txt_mod(vec)
assert img_mod2 is not None and txt_mod2 is not None
# prepare image for attention
img_modulated = self.img_norm1(img)
img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
img_qkv = self.img_attn.qkv(img_modulated)
img_q, img_k, img_v = img_qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
img_q, img_k = self.img_attn.norm(img_q, img_k)
# prepare txt for attention
txt_modulated = self.txt_norm1(txt)
txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
txt_qkv = self.txt_attn.qkv(txt_modulated)
txt_q, txt_k, txt_v = txt_qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k)
# run actual attention
q = Tensor.cat(txt_q, img_q, dim=2)
k = Tensor.cat(txt_k, img_k, dim=2)
v = Tensor.cat(txt_v, img_v, dim=2)
attn = attention(q, k, v, pe=pe)
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
# calculate the img blocks
img = img + img_mod1.gate * self.img_attn.proj(img_attn)
img = img + img_mod2.gate * ((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift).sequential(self.img_mlp)
# calculate the txt blocks
txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
txt = txt + txt_mod2.gate * ((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift).sequential(self.txt_mlp)
return img, txt
class SingleStreamBlock:
"""
A DiT block with parallel linear layers as described in
https://arxiv.org/abs/2302.05442 and adapted modulation interface.
"""
def __init__(self,hidden_size:int, num_heads:int, mlp_ratio:float=4.0, qk_scale:Optional[float]=None):
self.hidden_dim = hidden_size
self.num_heads = num_heads
head_dim = hidden_size // num_heads
self.scale = qk_scale or head_dim**-0.5
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
# qkv and mlp_in
self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
# proj and mlp_out
self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
self.norm = QKNorm(head_dim)
self.hidden_size = hidden_size
self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.mlp_act = Tensor.gelu
self.modulation = Modulation(hidden_size, double=False)
def __call__(self, x:Tensor, vec:Tensor, pe:Tensor) -> Tensor:
mod, _ = self.modulation(vec)
x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
qkv, mlp = Tensor.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
q, k, v = qkv.rearrange("B L (K H D) -> K B H L D", K=3, H=self.num_heads)
q, k = self.norm(q, k)
# compute attention
attn = attention(q, k, v, pe=pe)
# compute activation in mlp stream, cat again and run second linear layer
output = self.linear2(Tensor.cat(attn, self.mlp_act(mlp), dim=2))
return x + mod.gate * output
class LastLayer:
def __init__(self, hidden_size:int, patch_size:int, out_channels:int):
self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
self.adaLN_modulation:List[Callable[[Tensor], Tensor]] = [Tensor.silu, nn.Linear(hidden_size, 2 * hidden_size, bias=True)]
def __call__(self, x:Tensor, vec:Tensor) -> Tensor:
shift, scale = vec.sequential(self.adaLN_modulation).chunk(2, dim=1)
x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
return self.linear(x)
def timestep_embedding(t:Tensor, dim:int, max_period:int=10000, time_factor:float=1000.0) -> Tensor:
"""
Create sinusoidal timestep embeddings.
:param t: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param dim: the dimension of the output.
:param max_period: controls the minimum frequency of the embeddings.
:return: an (N, D) Tensor of positional embeddings.
"""
t = time_factor * t
half = dim // 2
freqs = Tensor.exp(-math.log(max_period) * Tensor.arange(0, stop=half, dtype=dtypes.float32) / half).to(t.device)
args = t[:, None].float() * freqs[None]
embedding = Tensor.cat(Tensor.cos(args), Tensor.sin(args), dim=-1)
if dim % 2: embedding = Tensor.cat(*[embedding, Tensor.zeros_like(embedding[:, :1])], dim=-1)
if Tensor.is_floating_point(t): embedding = embedding.cast(t.dtype)
return embedding
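# illustrative: with dim=4 and t=Tensor([0.001]), half=2, freqs=[1.0, 1e-2] and args=[[1.0, 0.01]],
# so the embedding is [[cos(1.0), cos(0.01), sin(1.0), sin(0.01)]] with shape (1, 4)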
# https://github.com/black-forest-labs/flux/blob/main/src/flux/model.py
class Flux:
"""
Transformer model for flow matching on sequences.
"""
def __init__(
self,
guidance_embed:bool,
in_channels:int = 64,
vec_in_dim:int = 768,
context_in_dim:int = 4096,
hidden_size:int = 3072,
mlp_ratio:float = 4.0,
num_heads:int = 24,
depth:int = 19,
depth_single_blocks:int = 38,
axes_dim:Optional[List[int]] = None,
theta:int = 10_000,
qkv_bias:bool = True,
):
axes_dim = axes_dim or [16, 56, 56]
self.guidance_embed = guidance_embed
self.in_channels = in_channels
self.out_channels = self.in_channels
if hidden_size % num_heads != 0:
raise ValueError(f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}")
pe_dim = hidden_size // num_heads
if sum(axes_dim) != pe_dim:
raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
self.hidden_size = hidden_size
self.num_heads = num_heads
self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim)
self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
self.guidance_in:Callable[[Tensor], Tensor] = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else tensor_identity
self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
self.double_blocks = [DoubleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias) for _ in range(depth)]
self.single_blocks = [SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio) for _ in range(depth_single_blocks)]
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
def __call__(self, img:Tensor, img_ids:Tensor, txt:Tensor, txt_ids:Tensor, timesteps:Tensor, y:Tensor, guidance:Optional[Tensor] = None) -> Tensor:
if img.ndim != 3 or txt.ndim != 3:
raise ValueError("Input img and txt tensors must have 3 dimensions.")
# running on sequences img
img = self.img_in(img)
vec = self.time_in(timestep_embedding(timesteps, 256))
if self.guidance_embed:
if guidance is None:
raise ValueError("Didn't get guidance strength for guidance distilled model.")
vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
vec = vec + self.vector_in(y)
txt = self.txt_in(txt)
ids = Tensor.cat(txt_ids, img_ids, dim=1)
pe = self.pe_embedder(ids)
for double_block in self.double_blocks:
img, txt = double_block(img=img, txt=txt, vec=vec, pe=pe)
img = Tensor.cat(txt, img, dim=1)
for single_block in self.single_blocks:
img = single_block(img, vec=vec, pe=pe)
img = img[:, txt.shape[1] :, ...]
return self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
# https://github.com/black-forest-labs/flux/blob/main/src/flux/util.py
def load_flow_model(name:str, model_path:str):
# Loading Flux
print("Init model")
model = Flux(guidance_embed=(name != "flux-schnell"))
if not model_path: model_path = fetch(urls[name])
state_dict = {k.replace("scale", "weight"): v for k, v in safe_load(model_path).items()}
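# the "scale" -> "weight" rename is presumably because the checkpoint stores RMSNorm parameters as "*.scale" while tinygrad's nn.RMSNorm (used in QKNorm) calls them "weight"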
load_state_dict(model, state_dict)
return model
def load_T5(max_length:int=512):
# max length 64, 128, 256 and 512 should work (if your sequence is short enough)
print("Init T5")
T5 = T5Embedder(max_length, fetch(urls["T5_tokenizer"]))
pt_1 = fetch(urls["T5_1_of_2"])
pt_2 = fetch(urls["T5_2_of_2"])
load_state_dict(T5.encoder, safe_load(pt_1) | safe_load(pt_2), strict=False)
return T5
def load_clip():
print("Init Clip")
clip = ClipEmbedder()
load_state_dict(clip.transformer, safe_load(fetch(urls["clip"])))
return clip
def load_ae() -> AutoEncoder:
# Loading the autoencoder
print("Init AE")
ae = AutoEncoder(0.3611, 0.1159)
load_state_dict(ae, safe_load(fetch(urls["ae"])))
return ae
# https://github.com/black-forest-labs/flux/blob/main/src/flux/sampling.py
def prepare(T5:T5Embedder, clip:ClipEmbedder, img:Tensor, prompt:Union[str, List[str]]) -> Dict[str, Tensor]:
bs, _, h, w = img.shape
if bs == 1 and not isinstance(prompt, str):
bs = len(prompt)
img = img.rearrange("b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
if img.shape[0] == 1 and bs > 1:
img = img.expand((bs, *img.shape[1:]))
img_ids = Tensor.zeros(h // 2, w // 2, 3).contiguous()
img_ids[..., 1] = img_ids[..., 1] + Tensor.arange(h // 2)[:, None]
img_ids[..., 2] = img_ids[..., 2] + Tensor.arange(w // 2)[None, :]
img_ids = img_ids.rearrange("h w c -> 1 (h w) c")
img_ids = img_ids.expand((bs, *img_ids.shape[1:]))
if isinstance(prompt, str):
prompt = [prompt]
txt = T5(prompt).realize()
if txt.shape[0] == 1 and bs > 1:
txt = txt.expand((bs, *txt.shape[1:]))
txt_ids = Tensor.zeros(bs, txt.shape[1], 3)
vec = clip(prompt).realize()
if vec.shape[0] == 1 and bs > 1:
vec = vec.expand((bs, *vec.shape[1:]))
return {"img": img, "img_ids": img_ids.to(img.device), "txt": txt.to(img.device), "txt_ids": txt_ids.to(img.device), "vec": vec.to(img.device)}
def get_schedule(num_steps:int, image_seq_len:int, base_shift:float=0.5, max_shift:float=1.15, shift:bool=True) -> List[float]:
# extra step for zero
step_size = -1.0 / num_steps
timesteps = Tensor.arange(1, 0 + step_size, step_size)
# shifting the schedule to favor high timesteps for higher signal images
if shift:
# estimate mu based on linear estimation between two points
mu = 0.5 + (max_shift - base_shift) * (image_seq_len - 256) / (4096 - 256)
timesteps = math.exp(mu) / (math.exp(mu) + (1 / timesteps - 1))
return timesteps.tolist()
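# the shift maps each uniform timestep t to exp(mu) / (exp(mu) + (1/t - 1)); a longer image_seq_len gives a larger mu,
# which bends the schedule toward high timesteps so bigger images spend more of the step budget at high noise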
@TinyJit
def run(model, *args): return model(*args).realize()
def denoise(model, img:Tensor, img_ids:Tensor, txt:Tensor, txt_ids:Tensor, vec:Tensor, timesteps:List[float], guidance:float=4.0) -> Tensor:
# this is ignored for schnell
guidance_vec = Tensor((guidance,), device=img.device, dtype=img.dtype).expand((img.shape[0],))
for t_curr, t_prev in tqdm(list(zip(timesteps[:-1], timesteps[1:])), "Denoising"):
t_vec = Tensor((t_curr,), device=img.device, dtype=img.dtype).expand((img.shape[0],))
pred = run(model, img, img_ids, txt, txt_ids, t_vec, vec, guidance_vec)
img = img + (t_prev - t_curr) * pred
return img
def unpack(x:Tensor, height:int, width:int) -> Tensor:
return x.rearrange("b (h w) (c ph pw) -> b c (h ph) (w pw)", h=math.ceil(height / 16), w=math.ceil(width / 16), ph=2, pw=2)
# https://github.com/black-forest-labs/flux/blob/main/src/flux/cli.py
if __name__ == "__main__":
default_prompt = "bananas and a can of coke"
parser = argparse.ArgumentParser(description="Run Flux.1", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--name", type=str, default="flux-schnell", help="Name of the model to load")
parser.add_argument("--model_path", type=str, default="", help="path of the model file")
parser.add_argument("--width", type=int, default=512, help="width of the sample in pixels (should be a multiple of 16)")
parser.add_argument("--height", type=int, default=512, help="height of the sample in pixels (should be a multiple of 16)")
parser.add_argument("--seed", type=int, default=None, help="Set a seed for sampling")
parser.add_argument("--prompt", type=str, default=default_prompt, help="Prompt used for sampling")
parser.add_argument('--out', type=str, default=Path(tempfile.gettempdir()) / "rendered.png", help="Output filename")
parser.add_argument("--num_steps", type=int, default=None, help="number of sampling steps (default 4 for schnell, 50 for guidance distilled)") #noqa:E501
parser.add_argument("--guidance", type=float, default=3.5, help="guidance value used for guidance distillation")
parser.add_argument("--output_dir", type=str, default="output", help="output directory")
args = parser.parse_args()
if args.name not in ["flux-schnell", "flux-dev"]:
raise ValueError(f"Got unknown model name: {args.name}, chose from flux-schnell and flux-dev")
if args.num_steps is None:
args.num_steps = 4 if args.name == "flux-schnell" else 50
# allow for packing and conversion to latent space
height = 16 * (args.height // 16)
width = 16 * (args.width // 16)
if args.seed is None: args.seed = Tensor._seed
else: Tensor.manual_seed(args.seed)
print(f"Generating with seed {args.seed}:\n{args.prompt}")
t0 = time.perf_counter()
# prepare input noise
x = Tensor.randn(1, 16, 2 * math.ceil(height / 16), 2 * math.ceil(width / 16), dtype="bfloat16")
# load text embedders
T5 = load_T5(max_length=256 if args.name == "flux-schnell" else 512)
clip = load_clip()
# embed text to get inputs for model
inp = prepare(T5, clip, x, prompt=args.prompt)
timesteps = get_schedule(args.num_steps, inp["img"].shape[1], shift=(args.name != "flux-schnell"))
# done with text embedders
del T5, clip
# load model
model = load_flow_model(args.name, args.model_path)
# denoise initial noise
x = denoise(model, **inp, timesteps=timesteps, guidance=args.guidance)
# done with model
del model, run
# load autoencoder
ae = load_ae()
# decode latents to pixel space
x = unpack(x.float(), height, width)
x = ae.decode(x).realize()
t1 = time.perf_counter()
print(f"Done in {t1 - t0:.1f}s. Saving {args.out}")
# bring into PIL format and save
x = x.clamp(-1, 1)
x = x[0].rearrange("c h w -> h w c")
x = (127.5 * (x + 1.0)).cast("uint8")
img = Image.fromarray(x.numpy())
img.save(args.out)
# validation!
if args.prompt == default_prompt and args.name=="flux-schnell" and args.seed == 0 and args.width == args.height == 512:
ref_image = Tensor(np.array(Image.open("examples/flux1_seed0.png")))
distance = (((x.cast(dtypes.float) - ref_image.cast(dtypes.float)) / ref_image.max())**2).mean().item()
assert distance < 4e-3, colored(f"validation failed with {distance=}", "red")
print(colored(f"output validated with {distance=}", "green"))


@@ -1,299 +0,0 @@
from extra.models.mask_rcnn import MaskRCNN
from extra.models.resnet import ResNet
from extra.models.mask_rcnn import BoxList
from torch.nn import functional as F
from torchvision import transforms as T
from torchvision.transforms import functional as Ft
import random
from tinygrad.tensor import Tensor
from PIL import Image
import numpy as np
import torch
import argparse
import cv2
class Resize:
def __init__(self, min_size, max_size):
if not isinstance(min_size, (list, tuple)):
min_size = (min_size,)
self.min_size = min_size
self.max_size = max_size
# modified from torchvision to add support for max size
def get_size(self, image_size):
w, h = image_size
size = random.choice(self.min_size)
max_size = self.max_size
if max_size is not None:
min_original_size = float(min((w, h)))
max_original_size = float(max((w, h)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if (w <= h and w == size) or (h <= w and h == size):
return (h, w)
if w < h:
ow = size
oh = int(size * h / w)
else:
oh = size
ow = int(size * w / h)
return (oh, ow)
def __call__(self, image):
size = self.get_size(image.size)
image = Ft.resize(image, size)
return image
class Normalize:
def __init__(self, mean, std, to_bgr255=True):
self.mean = mean
self.std = std
self.to_bgr255 = to_bgr255
def __call__(self, image):
if self.to_bgr255:
image = image[[2, 1, 0]] * 255
else:
image = image[[0, 1, 2]] * 255
image = Ft.normalize(image, mean=self.mean, std=self.std)
return image
transforms = lambda size_scale: T.Compose(
[
Resize(int(800*size_scale), int(1333*size_scale)),
T.ToTensor(),
Normalize(
mean=[102.9801, 115.9465, 122.7717], std=[1., 1., 1.], to_bgr255=True
),
]
)
def expand_boxes(boxes, scale):
w_half = (boxes[:, 2] - boxes[:, 0]) * .5
h_half = (boxes[:, 3] - boxes[:, 1]) * .5
x_c = (boxes[:, 2] + boxes[:, 0]) * .5
y_c = (boxes[:, 3] + boxes[:, 1]) * .5
w_half *= scale
h_half *= scale
boxes_exp = torch.zeros_like(boxes)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
def expand_masks(mask, padding):
N = mask.shape[0]
M = mask.shape[-1]
pad2 = 2 * padding
scale = float(M + pad2) / M
padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2))
padded_mask[:, :, padding:-padding, padding:-padding] = mask
return padded_mask, scale
def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1):
# TODO: remove torch
mask = torch.tensor(mask.numpy())
box = torch.tensor(box.numpy())
padded_mask, scale = expand_masks(mask[None], padding=padding)
mask = padded_mask[0, 0]
box = expand_boxes(box[None], scale)[0]
box = box.to(dtype=torch.int32)
TO_REMOVE = 1
w = int(box[2] - box[0] + TO_REMOVE)
h = int(box[3] - box[1] + TO_REMOVE)
w = max(w, 1)
h = max(h, 1)
mask = mask.expand((1, 1, -1, -1))
mask = mask.to(torch.float32)
mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False)
mask = mask[0][0]
if thresh >= 0:
mask = mask > thresh
else:
mask = (mask * 255).to(torch.uint8)
im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8)
x_0 = max(box[0], 0)
x_1 = min(box[2] + 1, im_w)
y_0 = max(box[1], 0)
y_1 = min(box[3] + 1, im_h)
im_mask[y_0:y_1, x_0:x_1] = mask[
(y_0 - box[1]): (y_1 - box[1]), (x_0 - box[0]): (x_1 - box[0])
]
return im_mask
class Masker:
def __init__(self, threshold=0.5, padding=1):
self.threshold = threshold
self.padding = padding
def forward_single_image(self, masks, boxes):
boxes = boxes.convert("xyxy")
im_w, im_h = boxes.size
res = [
paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding)
for mask, box in zip(masks, boxes.bbox)
]
if len(res) > 0:
res = torch.stack(res, dim=0)[:, None]  # torch.stack takes a sequence of tensors, not unpacked args
else:
res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1]))
return Tensor(res.numpy())
def __call__(self, masks, boxes):
if isinstance(boxes, BoxList):
boxes = [boxes]
results = []
for mask, box in zip(masks, boxes):
result = self.forward_single_image(mask, box)
results.append(result)
return results
masker = Masker(threshold=0.5, padding=1)
def select_top_predictions(predictions, confidence_threshold=0.9):
scores = predictions.get_field("scores").numpy()
keep = [idx for idx, score in enumerate(scores) if score > confidence_threshold]
return predictions[keep]
def compute_prediction(original_image, model, confidence_threshold, size_scale=1.0):
image = transforms(size_scale)(original_image).numpy()
image = Tensor(image, requires_grad=False)
predictions = model(image)
prediction = predictions[0]
prediction = select_top_predictions(prediction, confidence_threshold)
width, height = original_image.size
prediction = prediction.resize((width, height))
if prediction.has_field("mask"):
masks = prediction.get_field("mask")
masks = masker([masks], [prediction])[0]
prediction.add_field("mask", masks)
return prediction
def compute_prediction_batched(batch, model, size_scale=1.0):
imgs = []
for img in batch:
imgs.append(transforms(size_scale)(img).numpy())
image = [Tensor(image, requires_grad=False) for image in imgs]
predictions = model(image)
del image
return predictions
palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
def findContours(*args, **kwargs):
if cv2.__version__.startswith('4'):
contours, hierarchy = cv2.findContours(*args, **kwargs)
elif cv2.__version__.startswith('3'):
_, contours, hierarchy = cv2.findContours(*args, **kwargs)
return contours, hierarchy
def compute_colors_for_labels(labels):
l = labels[:, None]
colors = l * palette
colors = (colors % 255).astype("uint8")
return colors
def overlay_mask(image, predictions):
image = np.asarray(image)
masks = predictions.get_field("mask").numpy()
labels = predictions.get_field("labels").numpy()
colors = compute_colors_for_labels(labels).tolist()
for mask, color in zip(masks, colors):
thresh = mask[0, :, :, None]
contours, hierarchy = findContours(
thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
)
image = cv2.drawContours(image, contours, -1, color, 3)
composite = image
return composite
CATEGORIES = [
"__background", "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
"bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
"sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
"wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table",
"toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster",
"sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]
def overlay_boxes(image, predictions):
labels = predictions.get_field("labels").numpy()
boxes = predictions.bbox
image = np.asarray(image)
colors = compute_colors_for_labels(labels).tolist()
for box, color in zip(boxes, colors):
box = torch.tensor(box.numpy())
box = box.to(torch.int64)
top_left, bottom_right = box[:2].tolist(), box[2:].tolist()
image = cv2.rectangle(
image, tuple(top_left), tuple(bottom_right), tuple(color), 1
)
return image
def overlay_class_names(image, predictions):
scores = predictions.get_field("scores").numpy().tolist()
labels = predictions.get_field("labels").numpy().tolist()
labels = [CATEGORIES[int(i)] for i in labels]
boxes = predictions.bbox.numpy()
image = np.asarray(image)
template = "{}: {:.2f}"
for box, score, label in zip(boxes, scores, labels):
x, y = box[:2]
s = template.format(label, score)
x, y = int(x), int(y)
cv2.putText(
image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1
)
return image
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Run MaskRCNN', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--image', type=str, help="Path of the image to run")
parser.add_argument('--threshold', type=float, default=0.7, help="Detector threshold")
parser.add_argument('--size_scale', type=float, default=1.0, help="Image resize multiplier")
parser.add_argument('--out', type=str, default="/tmp/rendered.png", help="Output filename")
args = parser.parse_args()
resnet = ResNet(50, num_classes=None, stride_in_1x1=True)
model_tiny = MaskRCNN(resnet)
model_tiny.load_from_pretrained()
img = Image.open(args.image)
top_result_tiny = compute_prediction(img, model_tiny, confidence_threshold=args.threshold, size_scale=args.size_scale)
bbox_image = overlay_boxes(img, top_result_tiny)
mask_image = overlay_mask(bbox_image, top_result_tiny)
final_image = overlay_class_names(mask_image, top_result_tiny)
im = Image.fromarray(final_image)
print(f"saving {args.out}")
im.save(args.out)
im.show()


@@ -1,118 +0,0 @@
import json, pprint
from tinygrad import fetch, nn, Tensor
from tinygrad.helpers import DEBUG
class FeedForward:
def __init__(self, model_dim, intermediate_dim):
self.proj_1 = nn.Linear(model_dim, 2*intermediate_dim, bias=False)
self.proj_2 = nn.Linear(intermediate_dim, model_dim, bias=False)
def __call__(self, x):
y_12 = self.proj_1(x)
y_1, y_2 = y_12.chunk(2, dim=-1)
return self.proj_2(y_1.silu() * y_2)
# NOTE: this RoPE doesn't match LLaMA's?
def _rotate_half(x: Tensor) -> Tensor:
x1, x2 = x.chunk(2, dim=-1)
return Tensor.cat(-x2, x1, dim=-1)
def _apply_rotary_pos_emb(x: Tensor, pos_sin: Tensor, pos_cos: Tensor) -> Tensor:
return (x * pos_cos) + (_rotate_half(x) * pos_sin)
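# this is the "rotate half" (GPT-NeoX style) formulation that pairs the first and second halves of head_dim,
# whereas LLaMA's reference implementation rotates interleaved (even, odd) pairs -- hence the NOTE above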
class Attention:
def __init__(self, model_dim, num_query_heads, num_kv_heads, head_dim):
self.qkv_proj = nn.Linear(model_dim, (num_query_heads + num_kv_heads*2) * head_dim, bias=False)
self.num_query_heads, self.num_kv_heads = num_query_heads, num_kv_heads
self.head_dim = head_dim
self.q_norm = nn.RMSNorm(head_dim)
self.k_norm = nn.RMSNorm(head_dim)
self.out_proj = nn.Linear(num_query_heads * head_dim, model_dim, bias=False)
def __call__(self, x:Tensor) -> Tensor:
batch_size, seq_len, embed_dim = x.shape
qkv = self.qkv_proj(x)
qkv = qkv.reshape(batch_size, seq_len, self.num_query_heads+self.num_kv_heads*2, self.head_dim).transpose(1, 2)
xq,xk,xv = qkv.split([self.num_query_heads, self.num_kv_heads, self.num_kv_heads], dim=1)
xq = self.q_norm(xq)
xk = self.k_norm(xk)
# add positional embedding (how many kernels is this?)
freq_constant = 10000
inv_freq = 1.0 / (freq_constant ** (Tensor.arange(0, self.head_dim, 2) / self.head_dim))
pos_index_theta = Tensor.einsum("i,j->ij", Tensor.arange(seq_len), inv_freq)
emb = Tensor.cat(pos_index_theta, pos_index_theta, dim=-1)
cos_emb, sin_emb = emb.cos()[None, None, :, :], emb.sin()[None, None, :, :]
xq = _apply_rotary_pos_emb(xq, sin_emb, cos_emb)
xk = _apply_rotary_pos_emb(xk, sin_emb, cos_emb)
# grouped-query attention
num_groups = self.num_query_heads // self.num_kv_heads
xk = xk.repeat_interleave(num_groups, dim=1)
xv = xv.repeat_interleave(num_groups, dim=1)
# masked attention
#start_pos = 0
#mask = Tensor.full((1, 1, seq_len, start_pos+seq_len), float("-inf"), dtype=xq.dtype, device=xq.device).triu(start_pos+1)
#attn_output = xq.scaled_dot_product_attention(xk, xv, mask).transpose(1, 2)
# causal is fine, no mask needed
attn_output = xq.scaled_dot_product_attention(xk, xv, is_causal=True).transpose(1, 2)
return self.out_proj(attn_output.reshape(batch_size, seq_len, self.num_query_heads * self.head_dim))
class Layer:
def __init__(self, model_dim, intermediate_dim, num_query_heads, num_kv_heads, head_dim):
self.ffn = FeedForward(model_dim, intermediate_dim)
self.attn = Attention(model_dim, num_query_heads, num_kv_heads, head_dim)
self.ffn_norm = nn.RMSNorm(model_dim)
self.attn_norm = nn.RMSNorm(model_dim)
def __call__(self, x:Tensor) -> Tensor: # (batch, seq_len, embed_dim)
x = x + self.attn(self.attn_norm(x))
x = x + self.ffn(self.ffn_norm(x))
return x
# stupidly complex
def make_divisible(v, divisor):
new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v: new_v += divisor
return new_v
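# e.g. make_divisible(100, 16) -> 96: round to the nearest multiple of divisor, then bump up by one divisor if that rounding lost more than ~10% of v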
class Transformer:
def __init__(self, cfg):
if DEBUG >= 3: pprint.pp(cfg)
self.layers = [Layer(cfg['model_dim'], make_divisible(int(cfg["model_dim"] * cfg['ffn_multipliers'][i]), cfg['ffn_dim_divisor']),
cfg['num_query_heads'][i], cfg['num_kv_heads'][i], cfg['head_dim']) for i in range(cfg['num_transformer_layers'])]
self.norm = nn.RMSNorm(cfg['model_dim'])
self.token_embeddings = nn.Embedding(cfg['vocab_size'], cfg['model_dim'])
def __call__(self, tokens:Tensor):
# _bsz, seqlen = tokens.shape
x = self.token_embeddings(tokens)
for l in self.layers: x = l(x)
return self.norm(x) @ self.token_embeddings.weight.T
if __name__ == "__main__":
#model_name = "OpenELM-270M-Instruct"
model_name = "OpenELM-270M" # this is fp32
model = Transformer(json.loads(fetch(f"https://huggingface.co/apple/{model_name}/resolve/main/config.json?download=true").read_bytes()))
weights = nn.state.safe_load(fetch(f"https://huggingface.co/apple/{model_name}/resolve/main/model.safetensors?download=true"))
if DEBUG >= 3:
for k, v in weights.items(): print(k, v.shape)
nn.state.load_state_dict(model, {k.removeprefix("transformer."):v for k,v in weights.items()})
from sentencepiece import SentencePieceProcessor
tokenizer = SentencePieceProcessor(fetch("https://github.com/karpathy/llama2.c/raw/master/tokenizer.model").as_posix())
toks = [tokenizer.bos_id()] + tokenizer.encode("Some car brands include")
for i in range(100):
ttoks = Tensor([toks])
out = model(ttoks).realize()
t0 = out[0].argmax(axis=-1).tolist()
toks.append(t0[-1])
# hmmm...passthrough still doesn't match (it shouldn't, it outputs the most likely)
print(tokenizer.decode(toks))
#print(toks)
#print(tokenizer.decode(t0))
#print(t0)


@@ -1,55 +0,0 @@
from tinygrad.helpers import trange
from tinygrad.nn.datasets import mnist
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
from functools import partial
class Model(nn.Module):
def __init__(self):
super().__init__()
self.c1 = nn.Conv2d(1, 32, 5)
self.c2 = nn.Conv2d(32, 32, 5)
self.bn1 = nn.BatchNorm(32)
self.m1 = nn.MaxPool2d(2)
self.c3 = nn.Conv2d(32, 64, 3)
self.c4 = nn.Conv2d(64, 64, 3)
self.bn2 = nn.BatchNorm(64)
self.m2 = nn.MaxPool2d(2)
self.lin = nn.Linear(576, 10)
def __call__(self, x):
x = mx.maximum(self.c1(x), 0)
x = mx.maximum(self.c2(x), 0)
x = self.m1(self.bn1(x))
x = mx.maximum(self.c3(x), 0)
x = mx.maximum(self.c4(x), 0)
x = self.m2(self.bn2(x))
return self.lin(mx.flatten(x, 1))
if __name__ == "__main__":
X_train, Y_train, X_test, Y_test = mnist()
X_train = mx.array(X_train.float().permute((0,2,3,1)).numpy())
Y_train = mx.array(Y_train.numpy())
X_test = mx.array(X_test.float().permute((0,2,3,1)).numpy())
Y_test = mx.array(Y_test.numpy())
model = Model()
optimizer = optim.Adam(1e-3)
def loss_fn(model, x, y): return nn.losses.cross_entropy(model(x), y).mean()
state = [model.state, optimizer.state]
@partial(mx.compile, inputs=state, outputs=state)
def step(samples):
# Compiled functions will also treat any inputs not in the parameter list as constants.
X,Y = X_train[samples], Y_train[samples]
loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
loss, grads = loss_and_grad_fn(model, X, Y)
optimizer.update(model, grads)
return loss
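# passing [model.state, optimizer.state] as compile inputs and outputs marks them as mutable state instead of baked-in constants,
# so the in-place optimizer.update above is tracked correctly across compiled steps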
test_acc = float('nan')
for i in (t:=trange(70)):
samples = mx.random.randint(0, X_train.shape[0], (512,)) # putting this in JIT didn't work well
loss = step(samples)
if i%10 == 9: test_acc = ((model(X_test).argmax(axis=-1) == Y_test).sum() * 100 / X_test.shape[0]).item()
t.set_description(f"loss: {loss.item():6.2f} test_accuracy: {test_acc:5.2f}%")


@@ -1,45 +0,0 @@
import gymnasium as gym
import numpy as np
from gymnasium.envs.registration import register
# a very simple game
# one of <size> lights will light up
# take the action of the lit up light
# in <hard_mode>, you act differently based on the step number and need to track this
class PressTheLightUpButton(gym.Env):
metadata = {"render_modes": []}
def __init__(self, render_mode=None, size=2, game_length=10, hard_mode=False):
self.size, self.game_length = size, game_length
self.observation_space = gym.spaces.Box(0, 1, shape=(self.size,), dtype=np.float32)
self.action_space = gym.spaces.Discrete(self.size)
self.step_num = 0
self.done = True
self.hard_mode = hard_mode
def _get_obs(self):
obs = [0]*self.size
if self.step_num < len(self.state):
obs[self.state[self.step_num]] = 1
return np.array(obs, dtype=np.float32)
def reset(self, seed=None, options=None):
super().reset(seed=seed)
self.state = np.random.randint(0, self.size, size=self.game_length)
self.step_num = 0
self.done = False
return self._get_obs(), {}
def step(self, action):
target = ((action + self.step_num) % self.size) if self.hard_mode else action
reward = int(target == self.state[self.step_num])
self.step_num += 1
if not reward:
self.done = True
return self._get_obs(), reward, self.done, self.step_num >= self.game_length, {}
register(
id="PressTheLightUpButton-v0",
entry_point="examples.rl.lightupbutton:PressTheLightUpButton",
max_episode_steps=None,
)
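# illustrative usage sketch (assumes gymnasium is installed and this module is importable as examples.rl.lightupbutton):
#   env = gym.make("PressTheLightUpButton-v0", size=2, game_length=10)
#   obs, _ = env.reset()
#   done = truncated = False
#   while not (done or truncated):
#     obs, reward, done, truncated, _ = env.step(int(np.argmax(obs)))  # press whichever light is lit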


@@ -1,136 +0,0 @@
#!/usr/bin/env python
#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
import sys
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import BatchNorm2d, optim
from tinygrad.helpers import getenv
from extra.datasets import fetch_mnist
from extra.augment import augment_img
from extra.training import train, evaluate
GPU = getenv("GPU")
QUICK = getenv("QUICK")
DEBUG = getenv("DEBUG")
class SqueezeExciteBlock2D:
def __init__(self, filters):
self.filters = filters
self.weight1 = Tensor.scaled_uniform(self.filters, self.filters//32)
self.bias1 = Tensor.scaled_uniform(1,self.filters//32)
self.weight2 = Tensor.scaled_uniform(self.filters//32, self.filters)
self.bias2 = Tensor.scaled_uniform(1, self.filters)
def __call__(self, input):
se = input.avg_pool2d(kernel_size=(input.shape[2], input.shape[3])) #GlobalAveragePool2D
se = se.reshape(shape=(-1, self.filters))
se = se.dot(self.weight1) + self.bias1
se = se.relu()
se = se.dot(self.weight2) + self.bias2
se = se.sigmoid().reshape(shape=(-1,self.filters,1,1)) #for broadcasting
se = input.mul(se)
return se
class ConvBlock:
def __init__(self, h, w, inp, filters=128, conv=3):
self.h, self.w = h, w
self.inp = inp
#init weights
self.cweights = [Tensor.scaled_uniform(filters, inp if i==0 else filters, conv, conv) for i in range(3)]
self.cbiases = [Tensor.scaled_uniform(1, filters, 1, 1) for i in range(3)]
#init layers
self._bn = BatchNorm2d(128)
self._seb = SqueezeExciteBlock2D(filters)
def __call__(self, input):
x = input.reshape(shape=(-1, self.inp, self.w, self.h))
for cweight, cbias in zip(self.cweights, self.cbiases):
x = x.pad(padding=[1,1,1,1]).conv2d(cweight).add(cbias).relu()
x = self._bn(x)
x = self._seb(x)
return x
class BigConvNet:
def __init__(self):
self.conv = [ConvBlock(28,28,1), ConvBlock(28,28,128), ConvBlock(14,14,128)]
self.weight1 = Tensor.scaled_uniform(128,10)
self.weight2 = Tensor.scaled_uniform(128,10)
def parameters(self):
if DEBUG: #keeping this for a moment
pars = [par for par in get_parameters(self) if par.requires_grad]
no_pars = 0
for par in pars:
print(par.shape)
no_pars += np.prod(par.shape)
print('no of parameters', no_pars)
return pars
else:
return get_parameters(self)
def save(self, filename):
with open(filename+'.npy', 'wb') as f:
for par in get_parameters(self):
#if par.requires_grad:
np.save(f, par.numpy())
def load(self, filename):
with open(filename+'.npy', 'rb') as f:
for par in get_parameters(self):
#if par.requires_grad:
try:
par.numpy()[:] = np.load(f)
if GPU:
par.gpu()
except:
print('Could not load parameter')
def forward(self, x):
x = self.conv[0](x)
x = self.conv[1](x)
x = x.avg_pool2d(kernel_size=(2,2))
x = self.conv[2](x)
x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
xo = x1.dot(self.weight1) + x2.dot(self.weight2)
return xo
if __name__ == "__main__":
lrs = [1e-4, 1e-5] if QUICK else [1e-3, 1e-4, 1e-5, 1e-5]
epochss = [2, 1] if QUICK else [13, 3, 3, 1]
BS = 32
lmbd = 0.00025
lossfn = lambda out,y: out.sparse_categorical_crossentropy(y) + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
X_train, Y_train, X_test, Y_test = fetch_mnist()
X_train = X_train.reshape(-1, 28, 28).astype(np.uint8)
X_test = X_test.reshape(-1, 28, 28).astype(np.uint8)
steps = len(X_train)//BS
np.random.seed(1337)
if QUICK:
steps = 1
X_test, Y_test = X_test[:BS], Y_test[:BS]
model = BigConvNet()
if len(sys.argv) > 1:
try:
model.load(sys.argv[1])
print('Loaded weights "'+sys.argv[1]+'", evaluating...')
evaluate(model, X_test, Y_test, BS=BS)
except:
print('could not load weights "'+sys.argv[1]+'".')
if GPU:
params = get_parameters(model)
[x.gpu_() for x in params]
for lr, epochs in zip(lrs, epochss):
optimizer = optim.Adam(model.parameters(), lr=lr)
for epoch in range(1,epochs+1):
#first epoch without augmentation
X_aug = X_train if epoch == 1 else augment_img(X_train)
train(model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, BS=BS)
accuracy = evaluate(model, X_test, Y_test, BS=BS)
model.save(f'examples/checkpoint{accuracy * 1e6:.0f}')


@@ -1,17 +0,0 @@
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d, BatchNorm2d
from tinygrad.nn.state import get_parameters
if __name__ == "__main__":
with Tensor.train():
BS, C1, H, W = 4, 16, 224, 224
C2, K, S, P = 64, 7, 2, 1
x = Tensor.uniform(BS, C1, H, W)
conv = Conv2d(C1, C2, kernel_size=K, stride=S, padding=P)
bn = BatchNorm2d(C2, track_running_stats=False)
for t in get_parameters([x, conv, bn]): t.realize()
print("running network")
x.sequential([conv, bn]).numpy()


@@ -1,669 +0,0 @@
# original implementation: https://github.com/svc-develop-team/so-vits-svc
from __future__ import annotations
import sys, logging, time, io, math, argparse, operator, numpy as np
from functools import partial, reduce
from pathlib import Path
from typing import Tuple, Optional, Type
from tinygrad import nn, dtypes, Tensor
from tinygrad.helpers import getenv, fetch
from tinygrad.nn.state import torch_load
from examples.vits import ResidualCouplingBlock, PosteriorEncoder, Encoder, ResBlock1, ResBlock2, LRELU_SLOPE, sequence_mask, split, get_hparams_from_file, load_checkpoint, weight_norm, HParams
from examples.sovits_helpers import preprocess
import soundfile
DEBUG = getenv("DEBUG")
F0_BIN = 256
F0_MAX = 1100.0
F0_MIN = 50.0
F0_MEL_MIN = 1127 * np.log(1 + F0_MIN / 700)
F0_MEL_MAX = 1127 * np.log(1 + F0_MAX / 700)
class SpeechEncoder:
def __init__(self, hidden_dim, model:ContentVec): self.hidden_dim, self.model = hidden_dim, model
def encode(self, ): raise NotImplementedError("implement me")
@classmethod
def load_from_pretrained(cls, checkpoint_path:str, checkpoint_url:str) -> ContentVec:
contentvec = ContentVec.load_from_pretrained(checkpoint_path, checkpoint_url)
return cls(contentvec)
class ContentVec256L9(SpeechEncoder):
def __init__(self, model:ContentVec): super().__init__(hidden_dim=256, model=model)
def encode(self, wav: Tensor):
feats = wav
if len(feats.shape) == 2: # double channels
feats = feats.mean(-1)
assert len(feats.shape) == 1, feats.dim()
feats = feats.reshape(1, -1)
padding_mask = Tensor.zeros_like(feats).cast(dtypes.bool)
logits = self.model.extract_features(feats.to(wav.device), padding_mask=padding_mask.to(wav.device), output_layer=9)
feats = self.model.final_proj(logits[0])
return feats.transpose(1,2)
class ContentVec768L12(SpeechEncoder):
def __init__(self, model:ContentVec): super().__init__(hidden_dim=768, model=model)
def encode(self, wav: Tensor):
feats = wav
if len(feats.shape) == 2: # double channels
feats = feats.mean(-1)
assert len(feats.shape) == 1, feats.dim()
feats = feats.reshape(1, -1)
padding_mask = Tensor.zeros_like(feats).cast(dtypes.bool)
logits = self.model.extract_features(feats.to(wav.device), padding_mask=padding_mask.to(wav.device), output_layer=12)
return logits[0].transpose(1,2)
# original code for contentvec: https://github.com/auspicious3000/contentvec/
class ContentVec:
# self.final_proj dims are hardcoded and depend on fairseq.data.dictionary Dictionary in the checkpoint. This param can't yet be loaded since there is no pickle for it. See with DEBUG=2.
# This means that the ContentVec only works with the hubert weights used in all SVC models
def __init__(self, cfg: HParams):
self.feature_grad_mult, self.untie_final_proj = cfg.feature_grad_mult, cfg.untie_final_proj
feature_enc_layers = eval(cfg.conv_feature_layers)
self.embed = feature_enc_layers[-1][0]
final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
self.feature_extractor = ConvFeatureExtractionModel(conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, conv_bias=cfg.conv_bias)
self.post_extract_proj = nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else None
self.encoder = TransformerEncoder(cfg)
self.layer_norm = nn.LayerNorm(self.embed)
self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim * 1) if self.untie_final_proj else nn.Linear(cfg.encoder_embed_dim, final_dim)
self.mask_emb = Tensor.uniform(cfg.encoder_embed_dim, dtype=dtypes.float32)
self.label_embs_concat = Tensor.uniform(504, final_dim, dtype=dtypes.float32)
def forward_features(self, source, padding_mask):
if self.feature_grad_mult > 0:
features = self.feature_extractor(source, padding_mask)
if self.feature_grad_mult != 1.0: pass # training: GradMultiply.forward(features, self.feature_grad_mult)
else:
features = self.feature_extractor(source, padding_mask)
return features
def forward_padding_mask(self, features, padding_mask): # replaces original forward_padding_mask for batch inference
lengths_org = tilde(padding_mask.cast(dtypes.bool)).cast(dtypes.int64).sum(1) # ensure its bool for tilde
lengths = (lengths_org - 400).float().div(320).floor().cast(dtypes.int64) + 1 # intermediate float to divide
padding_mask = lengths_to_padding_mask(lengths)
return padding_mask
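# the 400/320 constants presumably mirror the conv feature extractor's ~400-sample receptive field and 320-sample hop at 16kHz,
# converting a waveform-length padding mask into a frame-length one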
def extract_features(self, source: Tensor, spk_emb:Tensor=None, padding_mask=None, ret_conv=False, output_layer=None, tap=False):
features = self.forward_features(source, padding_mask)
if padding_mask is not None:
padding_mask = self.forward_padding_mask(features, padding_mask)
features = features.transpose(1, 2)
features = self.layer_norm(features)
if self.post_extract_proj is not None:
features = self.post_extract_proj(features)
x, _ = self.encoder(features, spk_emb, padding_mask=padding_mask, layer=(None if output_layer is None else output_layer - 1), tap=tap)
res = features if ret_conv else x
return res, padding_mask
@classmethod
def load_from_pretrained(cls, checkpoint_path:str, checkpoint_url:str) -> ContentVec:
fetch(checkpoint_url, checkpoint_path)
cfg = load_fairseq_cfg(checkpoint_path)
enc = cls(cfg.model)
_ = load_checkpoint_enc(checkpoint_path, enc, None)
logging.debug(f"{cls.__name__}: Loaded model with cfg={cfg}")
return enc
class TransformerEncoder:
def __init__(self, cfg: HParams):
def make_conv() -> nn.Conv1d:
layer = nn.Conv1d(self.embedding_dim, self.embedding_dim, kernel_size=cfg.conv_pos, padding=cfg.conv_pos // 2, groups=cfg.conv_pos_groups)
std = math.sqrt(4 / (cfg.conv_pos * self.embedding_dim))
layer.weight, layer.bias = (Tensor.normal(*layer.weight.shape, std=std)), (Tensor.zeros(*layer.bias.shape))
# for training: layer.weights need to be weight_normed
return layer
self.dropout, self.embedding_dim, self.layer_norm_first, self.layerdrop, self.num_layers, self.num_layers_1 = cfg.dropout, cfg.encoder_embed_dim, cfg.layer_norm_first, cfg.encoder_layerdrop, cfg.encoder_layers, cfg.encoder_layers_1
self.pos_conv, self.pos_conv_remove = [make_conv()], (1 if cfg.conv_pos % 2 == 0 else 0)
self.layers = [
TransformerEncoderLayer(self.embedding_dim, cfg.encoder_ffn_embed_dim, cfg.encoder_attention_heads, self.dropout, cfg.attention_dropout, cfg.activation_dropout, cfg.activation_fn, self.layer_norm_first, cond_layer_norm=(i >= cfg.encoder_layers))
for i in range(cfg.encoder_layers + cfg.encoder_layers_1)
]
self.layer_norm = nn.LayerNorm(self.embedding_dim)
self.cond_layer_norm = CondLayerNorm(self.embedding_dim) if cfg.encoder_layers_1 > 0 else None
# training: apply init_bert_params
def __call__(self, x, spk_emb, padding_mask=None, layer=None, tap=False):
x, layer_results = self.extract_features(x, spk_emb, padding_mask, layer, tap)
if self.layer_norm_first and layer is None:
x = self.cond_layer_norm(x, spk_emb) if (self.num_layers_1 > 0) else self.layer_norm(x)
return x, layer_results
def extract_features(self, x: Tensor, spk_emb: Tensor, padding_mask=None, tgt_layer=None, tap=False):
if tgt_layer is not None: # and not self.training
assert tgt_layer >= 0 and tgt_layer < len(self.layers)
if padding_mask is not None:
# x[padding_mask] = 0
assert padding_mask.shape == x.shape[:len(padding_mask.shape)] # first few dims of x must match padding_mask
tmp_mask = padding_mask.unsqueeze(-1).repeat((1, 1, x.shape[-1]))
tmp_mask = tilde(tmp_mask.cast(dtypes.bool))
x = tmp_mask.where(x, 0)
x_conv = self.pos_conv[0](x.transpose(1,2))
if self.pos_conv_remove > 0: x_conv = x_conv[:, :, : -self.pos_conv_remove]
x_conv = x_conv.gelu().transpose(1, 2)
x = (x + x_conv).transpose(0, 1) # B x T x C -> T x B x C
if not self.layer_norm_first: x = self.layer_norm(x)
x = x.dropout(p=self.dropout)
layer_results = []
r = None
for i, layer in enumerate(self.layers):
if i < self.num_layers: # if (not self.training or (dropout_probability > self.layerdrop)) and (i < self.num_layers):
assert layer.cond_layer_norm == False
x = layer(x, self_attn_padding_mask=padding_mask, need_weights=False)
if tgt_layer is not None or tap:
layer_results.append(x.transpose(0, 1))
if i >= self.num_layers:
assert layer.cond_layer_norm == True
x = layer(x, emb=spk_emb, self_attn_padding_mask=padding_mask, need_weights=False)
if i == tgt_layer:
r = x
break
if r is not None:
x = r
x = x.transpose(0, 1) # T x B x C -> B x T x C
return x, layer_results
class TransformerEncoderLayer:
def __init__(self, embedding_dim=768, ffn_embedding_dim=3072, num_attention_heads=8, dropout=0.1, attention_dropout=0.1, activation_dropout=0.1, activation_fn="relu", layer_norm_first=False, cond_layer_norm=False):
def get_activation_fn(activation):
if activation == "relu": return Tensor.relu
if activation == "gelu": return Tensor.gelu
else: raise RuntimeError(f"activation function={activation} is not supported")
self.embedding_dim, self.dropout, self.activation_dropout, self.layer_norm_first, self.num_attention_heads, self.cond_layer_norm, self.activation_fn = embedding_dim, dropout, activation_dropout, layer_norm_first, num_attention_heads, cond_layer_norm, get_activation_fn(activation_fn)
self.self_attn = MultiHeadAttention(self.embedding_dim, self.num_attention_heads)
self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim) if not cond_layer_norm else CondLayerNorm(self.embedding_dim)
self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
self.final_layer_norm = nn.LayerNorm(self.embedding_dim) if not cond_layer_norm else CondLayerNorm(self.embedding_dim)
def __call__(self, x:Tensor, self_attn_mask:Tensor=None, self_attn_padding_mask:Tensor=None, emb:Tensor=None, need_weights=False):
#self_attn_padding_mask = self_attn_padding_mask.reshape(x.shape[0], 1, 1, self_attn_padding_mask.shape[1]).expand(-1, self.num_attention_heads, -1, -1).reshape(x.shape[0] * self.num_attention_heads, 1, self_attn_padding_mask.shape[1]) if self_attn_padding_mask is not None else None
assert self_attn_mask is None and self_attn_padding_mask is not None
residual = x
if self.layer_norm_first:
x = self.self_attn_layer_norm(x) if not self.cond_layer_norm else self.self_attn_layer_norm(x, emb)
x = self.self_attn(x=x, mask=self_attn_padding_mask)
x = x.dropout(self.dropout)
x = residual + x
x = self.final_layer_norm(x) if not self.cond_layer_norm else self.final_layer_norm(x, emb)
x = self.activation_fn(self.fc1(x))
x = x.dropout(self.activation_dropout)
x = self.fc2(x)
x = x.dropout(self.dropout)
x = residual + x
else:
x = self.self_attn(x=x, mask=self_attn_padding_mask)
x = x.dropout(self.dropout)
x = residual + x
x = self.self_attn_layer_norm(x) if not self.cond_layer_norm else self.self_attn_layer_norm(x, emb)
residual = x
x = self.activation_fn(self.fc1(x))
x = x.dropout(self.activation_dropout)
x = self.fc2(x)
x = x.dropout(self.dropout)
x = residual + x
x = self.final_layer_norm(x) if not self.cond_layer_norm else self.final_layer_norm(x, emb)
return x
class MultiHeadAttention:
def __init__(self, n_state, n_head):
self.n_state, self.n_head = n_state, n_head
self.q_proj, self.k_proj, self.v_proj, self.out_proj = [nn.Linear(n_state, n_state) for _ in range(4)]
def __call__(self, x:Tensor, xa:Optional[Tensor]=None, mask:Optional[Tensor]=None):
x = x.transpose(0,1) # TxBxC -> BxTxC
q, k, v = self.q_proj(x), self.k_proj(x if xa is None else xa), self.v_proj(x if xa is None else xa)
q, k, v = [x.reshape(*q.shape[:2], self.n_head, -1) for x in (q, k, v)]
wv = Tensor.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), None).transpose(1, 2).reshape(*x.shape[:2], -1)
ret = self.out_proj(wv).transpose(0,1) # BxTxC -> TxBxC
return ret
class ConvFeatureExtractionModel:
def __init__(self, conv_layers, dropout=.0, mode="default", conv_bias=False):
assert mode in {"default", "group_norm_masked", "layer_norm"}
def block(n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False):
def make_conv():
conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
conv.weight = Tensor.kaiming_normal(*conv.weight.shape)
return conv
assert (is_layer_norm and is_group_norm) == False, "layer norm and group norm are exclusive"
if is_layer_norm:
return [make_conv(), partial(Tensor.dropout, p=dropout),[partial(Tensor.transpose, dim0=-2, dim1=-1), nn.LayerNorm(dim, elementwise_affine=True), partial(Tensor.transpose, dim0=-2, dim1=-1)], Tensor.gelu]
elif is_group_norm and mode == "default":
return [make_conv(), partial(Tensor.dropout, p=dropout), nn.GroupNorm(dim, dim, affine=True), Tensor.gelu]
elif is_group_norm and mode == "group_norm_masked":
return [make_conv(), partial(Tensor.dropout, p=dropout), GroupNormMasked(dim, dim, affine=True), Tensor.gelu]
else:
return [make_conv(), partial(Tensor.dropout, p=dropout), Tensor.gelu]
in_d, self.conv_layers, self.mode = 1, [], mode
for i, cl in enumerate(conv_layers):
assert len(cl) == 3, "invalid conv definition: " + str(cl)
(dim, k, stride) = cl
if i == 0: self.cl = cl
self.conv_layers.append(block(in_d, dim, k, stride, is_layer_norm=(mode == "layer_norm"), is_group_norm=((mode == "default" or mode == "group_norm_masked") and i == 0), conv_bias=conv_bias))
in_d = dim
def __call__(self, x:Tensor, padding_mask:Tensor):
x = x.unsqueeze(1) # BxT -> BxCxT
if self.mode == "group_norm_masked":
if padding_mask is not None:
_, k, stride = self.cl
lengths_org = tilde(padding_mask.cast(dtypes.bool)).cast(dtypes.int64).sum(1) # ensure padding_mask is bool for tilde
lengths = (((lengths_org - k) / stride) + 1).floor().cast(dtypes.int64)
padding_mask = tilde(lengths_to_padding_mask(lengths)).cast(dtypes.int64) # lengths_to_padding_mask returns bool tensor
x = self.conv_layers[0][0](x) # padding_mask is numeric
x = self.conv_layers[0][1](x)
x = self.conv_layers[0][2](x, padding_mask)
x = self.conv_layers[0][3](x)
else:
x = x.sequential(self.conv_layers[0]) # default
for _, conv in enumerate(self.conv_layers[1:], start=1):
conv = reduce(lambda a,b: operator.iconcat(a,b if isinstance(b, list) else [b]), conv, []) # flatten
x = x.sequential(conv)
return x
class CondLayerNorm: # https://github.com/auspicious3000/contentvec/blob/main/contentvec/modules/cond_layer_norm.py#L10
def __init__(self, dim_last, eps=1e-5, dim_spk=256, elementwise_affine=True):
self.dim_last, self.eps, self.dim_spk, self.elementwise_affine = dim_last, eps, dim_spk, elementwise_affine
if self.elementwise_affine:
self.weight_ln = nn.Linear(self.dim_spk, self.dim_last, bias=False)
self.bias_ln = nn.Linear(self.dim_spk, self.dim_last, bias=False)
self.weight_ln.weight, self.bias_ln.weight = (Tensor.ones(*self.weight_ln.weight.shape)), (Tensor.zeros(*self.bias_ln.weight.shape))
def __call__(self, x: Tensor, spk_emb: Tensor):
axis = tuple(-1-i for i in range(len(x.shape[1:])))
x = x.layernorm(axis=axis, eps=self.eps)
if not self.elementwise_affine: return x
weights, bias = self.weight_ln(spk_emb), self.bias_ln(spk_emb)
return weights * x + bias
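# CondLayerNorm is FiLM-style conditioning: the per-channel scale and shift are linear functions of the
# speaker embedding, applied after a plain (non-affine) layernorm over the last dimension(s)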
class GroupNormMasked: # https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/modules/fp32_group_norm.py#L16
def __init__(self, num_groups, num_channels, eps=1e-5, affine=True):
self.num_groups, self.num_channels, self.eps, self.affine = num_groups, num_channels, eps, affine
self.weight, self.bias = (Tensor.ones(num_channels), Tensor.zeros(num_channels)) if self.affine else (None, None)
def __call__(self, x:Tensor, mask:Tensor):
bsz, n_c, length = x.shape
assert n_c % self.num_groups == 0
x = x.reshape(bsz, self.num_groups, n_c // self.num_groups, length)
if mask is None: mask = Tensor.ones_like(x)
else: mask = mask.reshape(bsz, 1, 1, length)
x = x * mask
lengths = mask.sum(axis=3, keepdim=True)
assert x.shape[2] == 1
mean_ = x.mean(axis=3, keepdim=True)
mean = mean_ * length / lengths
var = (((x.std(axis=3, keepdim=True) ** 2) + mean_**2) * length / lengths - mean**2) + self.eps
return x.add(-mean).div(var.sqrt()).reshape(bsz, n_c, length).mul(self.weight.reshape(1,-1,1)).add(self.bias.reshape(1,-1,1))
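# the mean/var are computed over the full zero-padded time axis and then rescaled by length / lengths
# (total frames / unmasked frames), so padded positions do not bias the group statistics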
class Synthesizer:
def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, ssl_dim, n_speakers, sampling_rate=44100, vol_embedding=False, n_flow_layer=4, **kwargs):
self.spec_channels, self.inter_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.segment_size, self.n_speakers, self.gin_channels, self.vol_embedding = spec_channels, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, segment_size, n_speakers, gin_channels, vol_embedding
self.emb_g = nn.Embedding(n_speakers, gin_channels)
if vol_embedding: self.emb_vol = nn.Linear(1, hidden_channels)
self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
self.enc_p = TextEncoder(inter_channels, hidden_channels, kernel_size, n_layers, filter_channels=filter_channels, n_heads=n_heads, p_dropout=p_dropout)
self.dec = Generator(sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels)
self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels)
self.emb_uv = nn.Embedding(vocab_size=2, embed_size=hidden_channels)
def infer(self, c:Tensor, f0:Tensor, uv:Tensor, g:Tensor=None, noise_scale=0.35, seed=52468, vol=None) -> Tuple[Tensor, Tensor]:
Tensor.manual_seed(getenv('SEED', seed))
c_lengths = (Tensor.ones([c.shape[0]]) * c.shape[-1]).to(c.device)
if len(g.shape) == 1: g = g.unsqueeze(0)
g = self.emb_g(g).transpose(1, 2)
x_mask = sequence_mask(c_lengths, c.shape[2]).unsqueeze(1).cast(c.dtype)
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
x = self.pre(c) * x_mask + self.emb_uv(uv.cast(dtypes.int64)).transpose(1, 2) + vol
z_p, _, _, c_mask = self.enc_p.forward(x, x_mask, f0=self._f0_to_coarse(f0), noise_scale=noise_scale)
z = self.flow.forward(z_p, c_mask, g=g, reverse=True)
o = self.dec.forward(z * c_mask, g=g, f0=f0)
return o,f0
def _f0_to_coarse(self, f0 : Tensor):
f0_mel = 1127 * (1 + f0 / 700).log()
a = (F0_BIN - 2) / (F0_MEL_MAX - F0_MEL_MIN)
b = F0_MEL_MIN * a - 1.
f0_mel = (f0_mel > 0).where(f0_mel * a - b, f0_mel)
f0_coarse = f0_mel.ceil().cast(dtype=dtypes.int64)
f0_coarse = f0_coarse * (f0_coarse > 0)
f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)
f0_coarse = f0_coarse * (f0_coarse < F0_BIN)
f0_coarse = f0_coarse + ((f0_coarse >= F0_BIN) * (F0_BIN - 1))
return f0_coarse
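# _f0_to_coarse: unvoiced frames (f0 == 0) end up clamped to bin 1, voiced frames are mapped to the mel
# scale and linearly quantized into bins [1, F0_BIN - 1]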
@classmethod
def load_from_pretrained(cls, config_path:str, config_url:str, weights_path:str, weights_url:str) -> Synthesizer:
fetch(config_url, config_path)
hps = get_hparams_from_file(config_path)
fetch(weights_url, weights_path)
net_g = cls(hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, **hps.model)
_ = load_checkpoint(weights_path, net_g, None, skip_list=["f0_decoder"])
logging.debug(f"{cls.__name__}:Loaded model with hps: {hps}")
return net_g, hps
class TextEncoder:
def __init__(self, out_channels, hidden_channels, kernel_size, n_layers, gin_channels=0, filter_channels=None, n_heads=None, p_dropout=None):
self.out_channels, self.hidden_channels, self.kernel_size, self.n_layers, self.gin_channels = out_channels, hidden_channels, kernel_size, n_layers, gin_channels
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
self.f0_emb = nn.Embedding(256, hidden_channels) # n_vocab = 256
self.enc_ = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
def forward(self, x, x_mask, f0=None, noise_scale=1):
x = x + self.f0_emb(f0).transpose(1, 2)
x = self.enc_.forward(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
m, logs = split(stats, self.out_channels, dim=1)
z = (m + randn_like(m) * logs.exp() * noise_scale) * x_mask
return z, m, logs, x_mask
class Upsample:
def __init__(self, scale_factor):
assert scale_factor % 1 == 0, "Only integer scale factor allowed."
self.scale = int(scale_factor)
def forward(self, x:Tensor):
repeats = tuple([1] * len(x.shape) + [self.scale])
new_shape = (*x.shape[:-1], x.shape[-1] * self.scale)
return x.unsqueeze(-1).repeat(repeats).reshape(new_shape)
class SineGen:
def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voice_threshold=0, flag_for_pulse=False):
self.sine_amp, self.noise_std, self.harmonic_num, self.sampling_rate, self.voiced_threshold, self.flag_for_pulse = sine_amp, noise_std, harmonic_num, samp_rate, voice_threshold, flag_for_pulse
self.dim = self.harmonic_num + 1
def _f02uv(self, f0): return (f0 > self.voiced_threshold).float() #generate uv signal
def _f02sine(self, f0_values):
def padDiff(x : Tensor): return (x.pad((0,0,-1,1)) - x).pad((0,0,0,-1))
def mod(x: Tensor, n: int) -> Tensor: return x - n * x.div(n).floor() # this is what the % operator does in pytorch.
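# e.g. mod(Tensor([-0.3]), 1) -> 0.7, matching the behaviour of % on negative values in pytorch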
rad_values = mod((f0_values / self.sampling_rate) , 1) # convert to F0 in rad
rand_ini = Tensor.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device) # initial phase noise
#rand_ini[:, 0] = 0
m = Tensor.ones(f0_values.shape[0]).unsqueeze(1).pad((0,f0_values.shape[2]-1,0,0)).cast(dtypes.bool)
m = tilde(m)
rand_ini = m.where(rand_ini, 0)
#rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
tmp = rad_values[:, 0, :] + rand_ini
m = Tensor.ones(tmp.shape).pad((0,0,0,rad_values.shape[1]-1,0)).cast(dtypes.bool)
m = tilde(m)
tmp = tmp.unsqueeze(1).pad((0,0,0,rad_values.shape[1]-1,0))
rad_values = m.where(rad_values, tmp)
tmp_over_one = mod(rad_values.cumsum(1), 1)
tmp_over_one_idx = padDiff(tmp_over_one) < 0
cumsum_shift = Tensor.zeros_like(rad_values)
#cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
tmp_over_one_idx = (tmp_over_one_idx * -1.0).pad((0,0,1,0))
cumsum_shift = tmp_over_one_idx
sines = ((rad_values + cumsum_shift).cumsum(1) * 2 * np.pi).sin()
return sines
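# _f02sine integrates the per-step frequency (f0 / sampling_rate, in revolutions per sample) with cumsum;
# the -1 corrections at phase-wrap points keep the accumulated phase roughly bounded (equivalent modulo 1)
# before converting to radians and taking sin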
def forward(self, f0, upp=None):
fn = f0.mul(Tensor([[list(range(1, self.harmonic_num + 2))]], dtype=dtypes.float32).to(f0.device))
sine_waves = self._f02sine(fn) * self.sine_amp #generate sine waveforms
uv = self._f02uv(f0) # generate uv signal
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * randn_like(sine_waves)
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
class SourceHnNSF:
def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshold=0):
self.sine_amp, self.noise_std = sine_amp, add_noise_std
self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold)
self.l_linear = nn.Linear(harmonic_num + 1, 1)
def forward(self, x, upp=None):
sine_waves, uv, _ = self.l_sin_gen.forward(x, upp)
sine_merge = self.l_linear(sine_waves.cast(self.l_linear.weight.dtype)).tanh()
noise = randn_like(uv) * self.sine_amp / 3
return sine_merge, noise, uv
# most of the hifigan in standard vits is reused here, but need to upsample and construct harmonic source from f0
class Generator:
def __init__(self, sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels):
self.sampling_rate, self.inter_channels, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.gin_channels = sampling_rate, inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels
self.num_kernels, self.num_upsamples = len(resblock_kernel_sizes), len(upsample_rates)
self.conv_pre = nn.Conv1d(inter_channels, upsample_initial_channel, 7, 1, padding=3)
self.f0_upsamp = Upsample(scale_factor=np.prod(upsample_rates))
self.m_source = SourceHnNSF(sampling_rate, harmonic_num=8)
resblock = ResBlock1 if resblock == '1' else ResBlock2
self.ups, self.noise_convs, self.resblocks = [], [], []
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
c_cur = upsample_initial_channel//(2**(i+1))
self.ups.append(nn.ConvTranspose1d(upsample_initial_channel//(2**i), c_cur, k, u, padding=(k-u)//2))
stride_f0 = int(np.prod(upsample_rates[i + 1:]))
self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2) if (i + 1 < len(upsample_rates)) else nn.Conv1d(1, c_cur, kernel_size=1))
for i in range(len(self.ups)):
ch = upsample_initial_channel // (2 ** (i + 1))
for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3)
if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
self.upp = np.prod(upsample_rates)
def forward(self, x, f0, g=None):
f0 = self.f0_upsamp.forward(f0[:, None]).transpose(1, 2) # bs,n,t
har_source, _, _ = self.m_source.forward(f0, self.upp)
har_source = har_source.transpose(1, 2)
x = self.conv_pre(x)
if g is not None: x = x + self.cond(g)
for i in range(self.num_upsamples):
x, xs = self.ups[i](x.leaky_relu(LRELU_SLOPE)), None
x_source = self.noise_convs[i](har_source)
x = x + x_source
for j in range(self.num_kernels):
if xs is None: xs = self.resblocks[i * self.num_kernels + j].forward(x)
else: xs += self.resblocks[i * self.num_kernels + j].forward(x)
x = xs / self.num_kernels
return self.conv_post(x.leaky_relu()).tanh()
# **** helpers ****
def randn_like(x:Tensor) -> Tensor: return Tensor.randn(*x.shape, dtype=x.dtype).to(device=x.device)
def tilde(x: Tensor) -> Tensor:
if x.dtype == dtypes.bool: return (1 - x).cast(dtypes.bool)
return (x + 1) * -1 # this seems to be what the ~ operator does in pytorch for non bool
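# e.g. tilde(Tensor([True, False])) -> [False, True]; tilde(Tensor([0, 3])) -> [-1, -4]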
def lengths_to_padding_mask(lens:Tensor) -> Tensor:
bsz, max_lens = lens.shape[0], lens.max().numpy().item()
mask = Tensor.arange(max_lens).to(lens.device).reshape(1, max_lens)
mask = mask.expand(bsz, -1) >= lens.reshape(bsz, 1).expand(-1, max_lens)
return mask.cast(dtypes.bool)
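# e.g. lengths_to_padding_mask(Tensor([2, 4])) -> [[False, False, True, True], [False, False, False, False]]
# (True marks padded positions)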
def repeat_expand_2d_left(content, target_len): # content : [h, t]
src_len = content.shape[-1]
temp = np.arange(src_len+1) * target_len / src_len
current_pos, cols = 0, []
for i in range(target_len):
if i >= temp[current_pos+1]:
current_pos += 1
cols.append(content[:, current_pos])
return Tensor.stack(*cols).transpose(0, 1)
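# left-aligned nearest-neighbour stretch along time: e.g. src_len=2, target_len=5 picks source columns [0, 0, 0, 1, 1]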
def load_fairseq_cfg(checkpoint_path):
assert Path(checkpoint_path).is_file()
state = torch_load(checkpoint_path)
cfg = state["cfg"] if ("cfg" in state and state["cfg"] is not None) else None
if cfg is None: raise RuntimeError(f"No cfg exist in state keys = {state.keys()}")
return HParams(**cfg)
def load_checkpoint_enc(checkpoint_path, model: ContentVec, optimizer=None, skip_list=[]):
assert Path(checkpoint_path).is_file()
start_time = time.time()
checkpoint_dict = torch_load(checkpoint_path)
saved_state_dict = checkpoint_dict['model']
weight_g, weight_v, parent = None, None, None
for key, v in saved_state_dict.items():
if any(layer in key for layer in skip_list): continue
try:
obj, skip = model, False
for k in key.split('.'):
if k.isnumeric(): obj = obj[int(k)]
elif isinstance(obj, dict): obj = obj[k]
else:
if k in ["weight_g", "weight_v"]:
parent, skip = obj, True
if k == "weight_g": weight_g = v
else: weight_v = v
if not skip:
parent = obj
obj = getattr(obj, k)
if weight_g is not None and weight_v is not None:
setattr(obj, "weight_g", weight_g.numpy())
setattr(obj, "weight_v", weight_v.numpy())
obj, v = getattr(parent, "weight"), weight_norm(weight_v, weight_g, 0)
weight_g, weight_v, parent, skip = None, None, None, False
if not skip and obj.shape == v.shape:
if "feature_extractor" in key and (isinstance(parent, (nn.GroupNorm, nn.LayerNorm))): # cast
obj.assign(v.to(obj.device).float())
else:
obj.assign(v.to(obj.device))
elif not skip: logging.error(f"MISMATCH SHAPE IN {key}, {obj.shape} {v.shape}")
except Exception as e: raise e
logging.info(f"Loaded checkpoint '{checkpoint_path}' in {time.time() - start_time:.4f}s")
return model, optimizer
def pad_array(arr, target_length):
current_length = arr.shape[0]
if current_length >= target_length: return arr
pad_width = target_length - current_length
pad_left = pad_width // 2
pad_right = pad_width - pad_left
padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
return padded_arr
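# e.g. pad_array(np.ones(3), 7) -> [0, 0, 1, 1, 1, 0, 0] (zero padding split evenly around the input)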
def split_list_by_n(list_collection, n, pre=0):
for i in range(0, len(list_collection), n):
yield list_collection[i-pre if i-pre>=0 else i: i + n]
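# e.g. list(split_list_by_n(list(range(10)), 4, pre=1)) -> [[0, 1, 2, 3], [3, 4, 5, 6, 7], [7, 8, 9]]
# (each chunk after the first re-includes `pre` trailing elements of the previous chunk)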
def get_sid(spk2id:HParams, speaker:str) -> Tensor:
speaker_id = spk2id[speaker]
if not speaker_id and type(speaker) is int:
if len(spk2id.__dict__) >= speaker: speaker_id = speaker
if speaker_id is None: raise RuntimeError(f"speaker={speaker} not in the speaker list")
return Tensor([int(speaker_id)], dtype=dtypes.int64).unsqueeze(0)
def get_encoder(ssl_dim) -> Type[SpeechEncoder]:
if ssl_dim == 256: return ContentVec256L9
if ssl_dim == 768: return ContentVec768L12
#########################################################################################
# CODE: https://github.com/svc-develop-team/so-vits-svc
#########################################################################################
# CONTENTVEC:
# CODE: https://github.com/auspicious3000/contentvec
# PAPER: https://arxiv.org/abs/2204.09224
#########################################################################################
# INSTALLATION: dependencies are for preprocessing and loading/saving audio.
# pip3 install soundfile librosa praat-parselmouth
#########################################################################################
# EXAMPLE USAGE:
# python3 examples/so_vits_svc.py --model tf2spy --file ~/recording.wav
#########################################################################################
# DEMO USAGE (uses audio sample from LJ-Speech):
# python3 examples/so_vits_svc.py --model saul_goodman
#########################################################################################
SO_VITS_SVC_PATH = Path(__file__).parents[1] / "weights/So-VITS-SVC"
VITS_MODELS = { # config_path, weights_path, config_url, weights_url
"saul_goodman" : (SO_VITS_SVC_PATH / "config_saul_gman.json", SO_VITS_SVC_PATH / "pretrained_saul_gman.pth", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/Saul_Goodman_80000/config.json", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/Saul_Goodman_80000/G_80000.pth"),
"drake" : (SO_VITS_SVC_PATH / "config_drake.json", SO_VITS_SVC_PATH / "pretrained_drake.pth", "https://huggingface.co/jaspa/so-vits-svc/resolve/main/aubrey/config_aubrey.json", "https://huggingface.co/jaspa/so-vits-svc/resolve/main/aubrey/pretrained_aubrey.pth"),
"cartman" : (SO_VITS_SVC_PATH / "config_cartman.json", SO_VITS_SVC_PATH / "pretrained_cartman.pth", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/EricCartman/config.json", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/EricCartman/G_10200.pth"),
"tf2spy" : (SO_VITS_SVC_PATH / "config_tf2spy.json", SO_VITS_SVC_PATH / "pretrained_tf2spy.pth", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_spy_60k/config.json", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_spy_60k/G_60000.pth"),
"tf2heavy" : (SO_VITS_SVC_PATH / "config_tf2heavy.json", SO_VITS_SVC_PATH / "pretrained_tf2heavy.pth", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_heavy_100k/config.json", "https://huggingface.co/Amo/so-vits-svc-4.0_GA/resolve/main/ModelsFolder/TF2_heavy_100k/G_100000.pth"),
"lady_gaga" : (SO_VITS_SVC_PATH / "config_gaga.json", SO_VITS_SVC_PATH / "pretrained_gaga.pth", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/LadyGaga/config.json", "https://huggingface.co/marcoc2/so-vits-svc-4.0-models/resolve/main/LadyGaga/G_14400.pth")
}
ENCODER_MODELS = { # weights_path, weights_url
"contentvec": (SO_VITS_SVC_PATH / "contentvec_checkpoint.pt", "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
}
ENCODER_MODEL = "contentvec"
DEMO_PATH, DEMO_URL = Path(__file__).parents[1] / "temp/LJ037-0171.wav", "https://keithito.com/LJ-Speech-Dataset/LJ037-0171.wav"
if __name__=="__main__":
logging.basicConfig(stream=sys.stdout, level=(logging.INFO if DEBUG < 1 else logging.DEBUG))
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", default=None, help=f"Specify the model to use. All supported models: {VITS_MODELS.keys()}", required=True)
parser.add_argument("-f", "--file", default=DEMO_PATH, help=f"Specify the path of the input file")
parser.add_argument("--out_dir", default=str(Path(__file__).parents[1] / "temp"), help="Specify the output path.")
parser.add_argument("--out_path", default=None, help="Specify the full output path. Overrides the --out_dir and --name parameter.")
parser.add_argument("--base_name", default="test", help="Specify the base of the output file name. Default is 'test'.")
parser.add_argument("--speaker", default=None, help="If not specified, the first available speaker is chosen. Usually there is only one speaker per model.")
parser.add_argument("--noise_scale", default=0.4)
parser.add_argument("--tran", default=0.0, help="Pitch shift, supports positive and negative (semitone) values. Default 0.0")
parser.add_argument("--pad_seconds", default=0.5)
parser.add_argument("--lg_num", default=0.0)
parser.add_argument("--clip_seconds", default=0.0)
parser.add_argument("--slice_db", default=-40)
args = parser.parse_args()
vits_model = args.model
encoder_location, vits_location = ENCODER_MODELS[ENCODER_MODEL], VITS_MODELS[vits_model]
Tensor.training = False
# Get Synthesizer and ContentVec
net_g, hps = Synthesizer.load_from_pretrained(vits_location[0], vits_location[2], vits_location[1], vits_location[3])
Encoder = get_encoder(hps.model.ssl_dim)
encoder = Encoder.load_from_pretrained(encoder_location[0], encoder_location[1])
# model config args
target_sample, spk2id, hop_length = hps.data.sampling_rate, hps.spk, hps.data.hop_length
vol_embedding = hps.model.vol_embedding if hasattr(hps.model, "vol_embedding") and hps.model.vol_embedding is not None else False
# args
slice_db, clip_seconds, lg_num, pad_seconds, tran, noise_scale, audio_path = args.slice_db, args.clip_seconds, args.lg_num, args.pad_seconds, args.tran, args.noise_scale, args.file
speaker = args.speaker if args.speaker is not None else list(hps.spk.__dict__.keys())[0]
### Loading audio and slicing ###
if audio_path == DEMO_PATH: fetch(DEMO_URL, DEMO_PATH)
assert Path(audio_path).is_file() and Path(audio_path).suffix == ".wav"
chunks = preprocess.cut(audio_path, db_thresh=slice_db)
audio_data, audio_sr = preprocess.chunks2audio(audio_path, chunks)
per_size = int(clip_seconds * audio_sr)
lg_size = int(lg_num * audio_sr)
### Infer per slice ###
global_frame = 0
audio = []
for (slice_tag, data) in audio_data:
print(f"\n====segment start, {round(len(data) / audio_sr, 3)}s====")
length = int(np.ceil(len(data) / audio_sr * target_sample))
if slice_tag:
print("empty segment")
_audio = np.zeros(length)
audio.extend(list(pad_array(_audio, length)))
global_frame += length // hop_length
continue
datas = [data] if per_size == 0 else split_list_by_n(data, per_size, lg_size)
for k, dat in enumerate(datas):
per_length = int(np.ceil(len(dat) / audio_sr * target_sample)) if clip_seconds!=0 else length
pad_len = int(audio_sr * pad_seconds)
dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
raw_path = io.BytesIO()
soundfile.write(raw_path, dat, audio_sr, format="wav")
raw_path.seek(0)
### Infer START ###
wav, sr = preprocess.load_audiofile(raw_path)
wav = preprocess.sinc_interp_resample(wav, sr, target_sample)[0]
wav16k, f0, uv = preprocess.get_unit_f0(wav, tran, hop_length, target_sample)
sid = get_sid(spk2id, speaker)
n_frames = f0.shape[1]
# ContentVec infer
start = time.time()
c = encoder.encode(wav16k)
c = repeat_expand_2d_left(c.squeeze(0).realize(), f0.shape[1]) # interpolate speech encoding to match f0
c = c.unsqueeze(0).realize()
enc_time = time.time() - start
# VITS infer
vits_start = time.time()
out_audio, f0 = net_g.infer(c, f0=f0, uv=uv, g=sid, noise_scale=noise_scale, vol=None)
out_audio = out_audio[0,0].float().realize()
vits_time = time.time() - vits_start
infer_time = time.time() - start
logging.info("total infer time:{:.2f}s, speech_enc time:{:.2f}s, vits time:{:.2f}s".format(infer_time, enc_time, vits_time))
### Infer END ###
out_sr, out_frame = out_audio.shape[-1], n_frames
global_frame += out_frame
_audio = out_audio.numpy()
pad_len = int(target_sample * pad_seconds)
_audio = _audio[pad_len:-pad_len]
_audio = pad_array(_audio, per_length)
audio.extend(list(_audio))
audio = np.array(audio)
out_path = Path(args.out_path or Path(args.out_dir)/f"{args.model}{f'_spk_{speaker}'}_{args.base_name}.wav")
out_path.parent.mkdir(parents=True, exist_ok=True)
soundfile.write(out_path, audio, target_sample, format="wav")
logging.info(f"Saved audio output to {out_path}")


@@ -1,204 +0,0 @@
import math
from typing import Optional, Tuple
from tinygrad import Tensor, dtypes
import librosa
import soundfile
import numpy as np
import parselmouth
class PMF0Predictor: # from https://github.com/svc-develop-team/so-vits-svc/
def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
self.hop_length, self.f0_min, self.f0_max, self.sampling_rate, self.name = hop_length, f0_min, f0_max, sampling_rate, "pm"
def interpolate_f0(self,f0):
vuv_vector = np.zeros_like(f0, dtype=np.float32)
vuv_vector[f0 > 0.0] = 1.0
vuv_vector[f0 <= 0.0] = 0.0
nzindex = np.nonzero(f0)[0]
data = f0[nzindex]
nzindex = nzindex.astype(np.float32)
time_org = self.hop_length / self.sampling_rate * nzindex
time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate
if data.shape[0] <= 0: return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector
if data.shape[0] == 1: return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector
f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1])
return f0,vuv_vector
def compute_f0(self,wav,p_len=None):
x = wav
if p_len is None: p_len = x.shape[0]//self.hop_length
else: assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
time_step = self.hop_length / self.sampling_rate * 1000
f0 = parselmouth.Sound(x, self.sampling_rate) \
.to_pitch_ac(time_step=time_step / 1000, voicing_threshold=0.6,pitch_floor=self.f0_min, pitch_ceiling=self.f0_max) \
.selected_array['frequency']
pad_size=(p_len - len(f0) + 1) // 2
if(pad_size>0 or p_len - len(f0) - pad_size>0):
f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
f0,uv = self.interpolate_f0(f0)
return f0
def compute_f0_uv(self,wav,p_len=None):
x = wav
if p_len is None: p_len = x.shape[0]//self.hop_length
else: assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
time_step = self.hop_length / self.sampling_rate * 1000
f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac(
time_step=time_step / 1000, voicing_threshold=0.6,
pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
pad_size=(p_len - len(f0) + 1) // 2
if(pad_size>0 or p_len - len(f0) - pad_size>0):
f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
f0,uv = self.interpolate_f0(f0)
return f0,uv
class Slicer: # from https://github.com/svc-develop-team/so-vits-svc/
def __init__(self, sr: int, threshold: float = -40., min_length: int = 5000, min_interval: int = 300, hop_size: int = 20, max_sil_kept: int = 5000):
if not min_length >= min_interval >= hop_size:
raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
if not max_sil_kept >= hop_size:
raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
min_interval = sr * min_interval / 1000
self.threshold = 10 ** (threshold / 20.)
self.hop_size = round(sr * hop_size / 1000)
self.win_size = min(round(min_interval), 4 * self.hop_size)
self.min_length = round(sr * min_length / 1000 / self.hop_size)
self.min_interval = round(min_interval / self.hop_size)
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
def _apply_slice(self, waveform, begin, end):
if len(waveform.shape) > 1: return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
else: return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
def slice(self, waveform):
samples = librosa.to_mono(waveform) if len(waveform.shape) > 1 else waveform
if samples.shape[0] <= self.min_length: return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
sil_tags, silence_start, clip_start = [], None, 0
for i, rms in enumerate(rms_list):
if rms < self.threshold: # Keep looping while frame is silent.
if silence_start is None: # Record start of silent frames.
silence_start = i
continue
if silence_start is None: continue # Keep looping while frame is not silent and silence start has not been recorded.
# Clear recorded silence start if interval is not enough or clip is too short
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
if not is_leading_silence and not need_slice_middle:
silence_start = None
continue
if i - silence_start <= self.max_sil_kept: # Need slicing. Record the range of silent frames to be removed.
pos = rms_list[silence_start: i + 1].argmin() + silence_start
sil_tags.append((0, pos) if silence_start == 0 else (pos, pos))
clip_start = pos
elif i - silence_start <= self.max_sil_kept * 2:
pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
pos += i - self.max_sil_kept
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
if silence_start == 0:
sil_tags.append((0, pos_r))
clip_start = pos_r
else:
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
clip_start = max(pos_r, pos)
else:
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
sil_tags.append((0, pos_r) if silence_start == 0 else (pos_l, pos_r))
clip_start = pos_r
silence_start = None
total_frames = rms_list.shape[0]
if silence_start is not None and total_frames - silence_start >= self.min_interval: # Deal with trailing silence.
silence_end = min(total_frames, silence_start + self.max_sil_kept)
pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
sil_tags.append((pos, total_frames + 1))
if len(sil_tags) == 0: return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}} # Apply and return slices.
chunks = []
if sil_tags[0][0]:
chunks.append({"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
for i in range(0, len(sil_tags)):
if i: chunks.append({"slice": False, "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
chunks.append({"slice": True, "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
if sil_tags[-1][1] * self.hop_size < len(waveform):
chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
chunk_dict = {}
for i in range(len(chunks)): chunk_dict[str(i)] = chunks[i]
return chunk_dict
# sinc_interp_hann audio resampling
class Resample:
def __init__(self, orig_freq:int=16000, new_freq:int=16000, lowpass_filter_width:int=6, rolloff:float=0.99, beta:Optional[float]=None, dtype:Optional[dtypes]=None):
self.orig_freq, self.new_freq, self.lowpass_filter_width, self.rolloff, self.beta = orig_freq, new_freq, lowpass_filter_width, rolloff, beta
self.gcd = math.gcd(int(self.orig_freq), int(self.new_freq))
self.kernel, self.width = self._get_sinc_resample_kernel(dtype) if self.orig_freq != self.new_freq else (None, None)
def __call__(self, waveform:Tensor) -> Tensor:
if self.orig_freq == self.new_freq: return waveform
return self._apply_sinc_resample_kernel(waveform)
def _apply_sinc_resample_kernel(self, waveform:Tensor):
if not waveform.is_floating_point(): raise TypeError(f"Waveform tensor expected to be of type float, but received {waveform.dtype}.")
orig_freq, new_freq = (int(self.orig_freq) // self.gcd), (int(self.new_freq) // self.gcd)
shape = waveform.shape
waveform = waveform.reshape(-1, shape[-1]) # pack batch
num_wavs, length = waveform.shape
target_length = int(math.ceil(new_freq * length / orig_freq))
waveform = waveform.pad((self.width, self.width + orig_freq))
resampled = waveform[:, None].conv2d(self.kernel, stride=orig_freq)
resampled = resampled.transpose(1, 2).reshape(num_wavs, -1)
resampled = resampled[..., :target_length]
resampled = resampled.reshape(shape[:-1] + resampled.shape[-1:]) # unpack batch
return resampled
def _get_sinc_resample_kernel(self, dtype=None):
orig_freq, new_freq = (int(self.orig_freq) // self.gcd), (int(self.new_freq) // self.gcd)
if self.lowpass_filter_width <= 0: raise ValueError("Low pass filter width should be positive.")
base_freq = min(orig_freq, new_freq)
base_freq *= self.rolloff
width = math.ceil(self.lowpass_filter_width * orig_freq / base_freq)
idx = Tensor.arange(-width, width + orig_freq, dtype=(dtype if dtype is not None else dtypes.float32))[None, None] / orig_freq
t = Tensor.arange(0, -new_freq, -1, dtype=dtype)[:, None, None] / new_freq + idx
t *= base_freq
t = t.clip(-self.lowpass_filter_width, self.lowpass_filter_width)
window = (t * math.pi / self.lowpass_filter_width / 2).cos() ** 2
t *= math.pi
scale = base_freq / orig_freq
kernels = Tensor.where(t == 0, Tensor(1.0, dtype=t.dtype).to(t.device), t.sin() / t)
kernels *= window * scale
if dtype is None: kernels = kernels.cast(dtype=dtypes.float32)
return kernels, width
def sinc_interp_resample(x:Tensor, orig_freq:int=16000, new_freq:int=16000, lowpass_filter_width:int=6, rolloff:float=0.99, beta:Optional[float]=None):
resamp = Resample(orig_freq, new_freq, lowpass_filter_width, rolloff, beta, x.dtype)
return resamp(x)
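# usage sketch (values assumed): wav16k = sinc_interp_resample(wav.unsqueeze(0), 44100, 16000)[0]
# resamples a mono waveform from 44.1kHz to 16kHz with a Hann-windowed sinc kernel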
def cut(audio_path, db_thresh=-30, min_len=5000):
audio, sr = librosa.load(audio_path, sr=None)
slicer = Slicer(sr=sr, threshold=db_thresh, min_length=min_len)
chunks = slicer.slice(audio)
return chunks
def chunks2audio(audio_path, chunks):
chunks = dict(chunks)
audio, sr = load_audiofile(audio_path)
if len(audio.shape) == 2 and audio.shape[1] >= 2:
audio = audio.mean(0).unsqueeze(0)
audio = audio.numpy()[0]
result = []
for k, v in chunks.items():
tag = v["split_time"].split(",")
if tag[0] != tag[1]:
result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
return result, sr
def load_audiofile(filepath:str, frame_offset:int=0, num_frames:int=-1, channels_first:bool=True):
with soundfile.SoundFile(filepath, "r") as file_:
frames = file_._prepare_read(frame_offset, None, num_frames)
waveform = file_.read(frames, "float32", always_2d=True)
sample_rate = file_.samplerate
waveform = Tensor(waveform)
if channels_first: waveform = waveform.transpose(0, 1)
return waveform, sample_rate
def get_unit_f0(wav:Tensor, tran, hop_length, target_sample, f0_filter=False) -> Tuple[Tensor,Tensor,Tensor]:
f0_predictor = PMF0Predictor(hop_length, sampling_rate=target_sample)
f0, uv = f0_predictor.compute_f0_uv(wav.numpy())
if f0_filter and sum(f0) == 0: raise RuntimeError("No voice detected")
f0 = Tensor(f0.astype(np.float32)).float()
f0 = (f0 * 2 ** (tran / 12)).unsqueeze(0)
uv = Tensor(uv.astype(np.float32)).float().unsqueeze(0)
wav16k = sinc_interp_resample(wav[None,:], target_sample, 16000)[0]
return wav16k.realize(), f0.realize(), uv.realize()


@@ -0,0 +1,34 @@
#!/usr/bin/env python3
from tinygrad import Tensor, Device, GlobalCounters, Context, dtypes
from tinygrad.helpers import getenv, colored
SZ = 8_000_000_000
GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this
if __name__ == "__main__":
# create tensors
tens = [Tensor.ones(SZ, dtype=dtypes.uint8, device=f"{Device.DEFAULT}:{i}").contiguous() for i in range(GPUS)]
Tensor.realize(*tens)
bw = [[0.0]*GPUS for _ in range(GPUS)]
for i in range(GPUS):
for j in range(GPUS):
GlobalCounters.reset()
with Context(DEBUG=2):
if i == j:
# this copy would be optimized out, just add 1
(tens[i]+1).realize()
else:
tens[i].to(f"{Device.DEFAULT}:{j}").realize()
t = max(GlobalCounters.time_sum_s, 1e-9)
bw[i][j] = SZ / t / 1e9 # GB/s
def fmt(x):
c = "green" if x > 50 else "yellow" if x > 20 else "red"
return colored(f"{x:6.1f}", c)
# header
print(" " * 8 + " ".join(f"{'d'+str(j):>6}" for j in range(GPUS)))
# rows
for i in range(GPUS):
print(f"{'s'+str(i):>6} -> " + " ".join(fmt(x) for x in bw[i]))

examples/tools/gpuburn.py

@@ -0,0 +1,16 @@
from tinygrad import Tensor, Device, TinyJit, dtypes
from tinygrad.helpers import getenv
GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this
N = 6144
@TinyJit
def many_matmul(A, B):
out = A
for _ in range(8): out = out@B
return out
if __name__ == "__main__":
A = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
B = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
while 1: many_matmul(A, B)
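# usage sketch: GPUS=4 python3 examples/tools/gpuburn.py
# runs jitted fp16 matmul chains sharded across all GPUs in an endless loop as a simple stress test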


@@ -1,104 +0,0 @@
import traceback
import time
from multiprocessing import Process, Queue
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import getenv, trange
from tinygrad.tensor import Tensor
from extra.datasets import fetch_cifar
from extra.models.efficientnet import EfficientNet
class TinyConvNet:
def __init__(self, classes=10):
conv = 3
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan,3,conv,conv)
self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.uniform(out_chan*6*6, classes)
def forward(self, x):
x = x.conv2d(self.c1).relu().max_pool2d()
x = x.conv2d(self.c2).relu().max_pool2d()
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1)
if __name__ == "__main__":
IMAGENET = getenv("IMAGENET")
classes = 1000 if IMAGENET else 10
TINY = getenv("TINY")
TRANSFER = getenv("TRANSFER")
if TINY:
model = TinyConvNet(classes)
elif TRANSFER:
model = EfficientNet(getenv("NUM", 0), classes, has_se=True)
model.load_from_pretrained()
else:
model = EfficientNet(getenv("NUM", 0), classes, has_se=False)
parameters = get_parameters(model)
print("parameter count", len(parameters))
optimizer = optim.Adam(parameters, lr=0.001)
BS, steps = getenv("BS", 64 if TINY else 16), getenv("STEPS", 2048)
print(f"training with batch size {BS} for {steps} steps")
if IMAGENET:
from extra.datasets.imagenet import fetch_batch
def loader(q):
while 1:
try:
q.put(fetch_batch(BS))
except Exception:
traceback.print_exc()
q = Queue(16)
for i in range(2):
p = Process(target=loader, args=(q,))
p.daemon = True
p.start()
else:
X_train, Y_train, _, _ = fetch_cifar()
X_train = X_train.reshape((-1, 3, 32, 32))
Y_train = Y_train.reshape((-1,))
with Tensor.train():
for i in (t := trange(steps)):
if IMAGENET:
X, Y = q.get(True)
else:
samp = np.random.randint(0, X_train.shape[0], size=(BS))
X, Y = X_train.numpy()[samp], Y_train.numpy()[samp]
st = time.time()
out = model.forward(Tensor(X.astype(np.float32), requires_grad=False))
fp_time = (time.time()-st)*1000.0
y = np.zeros((BS,classes), np.float32)
y[range(y.shape[0]),Y] = -classes
y = Tensor(y, requires_grad=False)
loss = out.log_softmax().mul(y).mean()
optimizer.zero_grad()
st = time.time()
loss.backward()
bp_time = (time.time()-st)*1000.0
st = time.time()
optimizer.step()
opt_time = (time.time()-st)*1000.0
st = time.time()
loss = loss.numpy()
cat = out.argmax(axis=1).numpy()
accuracy = (cat == Y).mean()
finish_time = (time.time()-st)*1000.0
# printing
t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f" %
(loss, accuracy,
fp_time, bp_time, opt_time, finish_time,
fp_time + bp_time + opt_time + finish_time))
del out, y, loss


@@ -1,46 +0,0 @@
import ast
import numpy as np
from PIL import Image
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, fetch
from extra.models.vit import ViT
"""
fn = "gs://vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz"
import tensorflow as tf
with tf.io.gfile.GFile(fn, "rb") as f:
dat = f.read()
with open("cache/"+ fn.rsplit("/", 1)[1], "wb") as g:
g.write(dat)
"""
Tensor.training = False
if getenv("LARGE", 0) == 1:
m = ViT(embed_dim=768, num_heads=12)
else:
# tiny
m = ViT(embed_dim=192, num_heads=3)
m.load_from_pretrained()
# category labels
lbls = ast.literal_eval(fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt").read_text())
#url = "https://upload.wikimedia.org/wikipedia/commons/4/41/Chicken.jpg"
url = "https://repository-images.githubusercontent.com/296744635/39ba6700-082d-11eb-98b8-cb29fb7369c0"
# junk
img = Image.open(fetch(url))
aspect_ratio = img.size[0] / img.size[1]
img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
img = np.array(img)
y0,x0=(np.asarray(img.shape)[:2]-224)//2
img = img[y0:y0+224, x0:x0+224]
img = np.moveaxis(img, [2,0,1], [0,1,2])
img = img.astype(np.float32)[:3].reshape(1,3,224,224)
img /= 255.0
img -= 0.5
img /= 0.5
out = m.forward(Tensor(img))
outnp = out.numpy().ravel()
choice = outnp.argmax()
print(out.shape, choice, outnp[choice], lbls[choice])