remove more stale stuff (#13765)

* remove more stale stuff * remove disassemblers/adreno * stale
2026-01-06 21:53:53 -05:00 · 2025-12-19 17:14:56 -04:00
parent 744af193f0
commit 45c459848d
19 changed files with 1 additions and 7965 deletions
--- a/examples/conversation.py
+++ b/examples/conversation.py
@@ -1,341 +0,0 @@
-import argparse
-import multiprocessing as mp
-import os
-import re
-import sys
-import time
-from contextlib import contextmanager
-from pathlib import Path
-
-import numpy as np
-import pyaudio
-import yaml
-from llama import LLaMa
-from vits import MODELS as VITS_MODELS
-from vits import Y_LENGTH_ESTIMATE_SCALARS, HParams, Synthesizer, TextMapper, get_hparams_from_file, load_model
-from whisper import init_whisper, transcribe_waveform
-from sentencepiece import SentencePieceProcessor
-
-from tinygrad.helpers import Timing, fetch
-from tinygrad import Tensor, dtypes
-
-# Whisper constants
-RATE = 16000
-CHUNK = 1600
-
-# LLaMa constants
-IM_START = 32001
-IM_END = 32002
-
-
-# Functions for encoding prompts to chatml md
-def encode_prompt(spp, k, v): return [IM_START]+spp.encode(f"{k}\n{v}")+[IM_END]+spp.encode("\n")
-def start_prompt(spp, k): return [IM_START]+spp.encode(f"{k}\n")
-
-def chunks(lst, n):
-  for i in range(0, len(lst), n): yield lst[i:i + n]
-
-def create_fixed_tokenizer():
-  """Function needed for extending tokenizer with additional chat tokens"""
-  import extra.junk.sentencepiece_model_pb2 as spb2
-  tokenizer_path = fetch("https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/resolve/main/tokenizer.model")
-  if SentencePieceProcessor(model_file=str(tokenizer_path)).vocab_size() != 32003:
-    print("creating fixed tokenizer")
-    mp = spb2.ModelProto()
-    mp.ParseFromString(tokenizer_path.read_bytes())
-    # https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/blob/main/added_tokens.json
-    mp.pieces.append(spb2.ModelProto.SentencePiece(piece="[PAD]", score=0))
-    mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_start|>", score=0))
-    mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_end|>", score=0))
-    tokenizer_path.write_bytes(mp.SerializeToString())
-  return tokenizer_path
-
-def llama_prepare(llama: LLaMa, temperature: float, pre_prompt_path: Path) -> tuple[list[int], str, str, str]:
-  """Prepares a llama model from a specified pre-prompt file"""
-  with open(str(pre_prompt_path)) as f:
-    config = yaml.safe_load(f.read())
-  toks = [llama.tokenizer.bos_id()] + encode_prompt(llama.tokenizer, "system", config["pre_prompt"].replace("\n", " "))
-  for i in config["examples"]:
-    toks += encode_prompt(llama.tokenizer, config["user_delim"], i["user_prompt"])
-    toks += encode_prompt(llama.tokenizer, config["resp_delim"], i["resp_prompt"])
-  llama.model(Tensor([toks]), 0, temperature).realize()  # NOTE: outputs are not used
-  return toks, config["user_delim"], config["resp_delim"], len(toks), llama.tokenizer.decode(toks)
-
-def llama_generate(
-  llama: LLaMa,
-  toks: list[int],
-  outputted: str,
-  prompt: str,
-  start_pos: int,
-  user_delim: str,
-  resp_delim: str,
-  temperature=0.7,
-  max_tokens=1000
-):
-  """Generates an output for the specified prompt"""
-  toks += encode_prompt(llama.tokenizer, user_delim, prompt)
-  toks += start_prompt(llama.tokenizer, resp_delim)
-
-  outputted = llama.tokenizer.decode(toks)
-  init_length = len(outputted)
-  for _ in range(max_tokens):
-    token = llama.model(Tensor([toks[start_pos:]]), start_pos, temperature).item()
-    start_pos = len(toks)
-    toks.append(token)
-
-    cur = llama.tokenizer.decode(toks)
-
-    # Print is just for debugging
-    sys.stdout.write(cur[len(outputted):])
-    sys.stdout.flush()
-    outputted = cur
-    if toks[-1] == IM_END: break
-  else:
-    toks.append(IM_END)
-  print() # because the output is flushed
-  return outputted, start_pos, outputted[init_length:].replace("<|im_end|>", "")
-
-def tts(
-  text_to_synthesize: str,
-  synth: Synthesizer,
-  hps: HParams,
-  emotion_embedding: Path,
-  speaker_id: int,
-  model_to_use: str,
-  noise_scale: float,
-  noise_scale_w: float,
-  length_scale: float,
-  estimate_max_y_length: bool,
-  text_mapper: TextMapper,
-  model_has_multiple_speakers: bool,
-  pad_length=600,
-  vits_pad_length=1000
-):
-  if model_to_use == "mmts-tts": text_to_synthesize = text_mapper.filter_oov(text_to_synthesize.lower())
-
-  # Convert the input text to a tensor.
-  stn_tst = text_mapper.get_text(text_to_synthesize, hps.data.add_blank, hps.data.text_cleaners)
-  init_shape = stn_tst.shape
-  assert init_shape[0] < pad_length, "text is too long"
-  x_tst, x_tst_lengths = stn_tst.pad(((0, pad_length - init_shape[0]),), value=1).unsqueeze(0), Tensor([init_shape[0]], dtype=dtypes.int64)
-  sid = Tensor([speaker_id], dtype=dtypes.int64) if model_has_multiple_speakers else None
-
-  # Perform inference.
-  audio_tensor = synth.infer(x_tst, x_tst_lengths, sid, noise_scale, length_scale, noise_scale_w, emotion_embedding=emotion_embedding,
-                             max_y_length_estimate_scale=Y_LENGTH_ESTIMATE_SCALARS[model_to_use] if estimate_max_y_length else None, pad_length=vits_pad_length)[0, 0]
-  # Save the audio output.
-  audio_data = (np.clip(audio_tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
-  return audio_data
-
-def init_vits(
-  model_to_use: str,
-  emotion_path: Path,
-  speaker_id: int,
-  seed: int,
-):
-  model_config = VITS_MODELS[model_to_use]
-
-  # Load the hyperparameters from the config file.
-  hps = get_hparams_from_file(fetch(model_config[0]))
-
-  # If model has multiple speakers, validate speaker id and retrieve name if available.
-  model_has_multiple_speakers = hps.data.n_speakers > 0
-  if model_has_multiple_speakers:
-    if speaker_id >= hps.data.n_speakers: raise ValueError(f"Speaker ID {speaker_id} is invalid for this model.")
-    if hps.__contains__("speakers"): # maps speaker ids to names
-      speakers = hps.speakers
-      if isinstance(speakers, list): speakers = {speaker: i for i, speaker in enumerate(speakers)}
-
-  # Load emotions if any. TODO: find an english model with emotions, this is untested atm.
-  emotion_embedding = None
-  if emotion_path is not None:
-    if emotion_path.endswith(".npy"): emotion_embedding = Tensor(np.load(emotion_path), dtype=dtypes.int64).unsqueeze(0)
-    else: raise ValueError("Emotion path must be a .npy file.")
-
-  # Load symbols, instantiate TextMapper and clean the text.
-  if hps.__contains__("symbols"): symbols = hps.symbols
-  elif model_to_use == "mmts-tts": symbols = [x.replace("\n", "") for x in fetch("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/vocab.txt").open(encoding="utf-8").readlines()]
-  else: symbols = ['_'] + list(';:,.!?¡¿—…"«»“” ') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') + list("ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ")
-  text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)
-
-  # Load the model.
-  if seed is not None:
-    Tensor.manual_seed(seed)
-    np.random.seed(seed)
-  net_g = load_model(text_mapper.symbols, hps, model_config)
-
-  return net_g, emotion_embedding, text_mapper, hps, model_has_multiple_speakers
-
-@contextmanager
-def output_stream(num_channels: int, sample_rate: int):
-  try:
-    p = pyaudio.PyAudio()
-    stream = p.open(format=pyaudio.paInt16, channels=num_channels, rate=sample_rate, output=True)
-    yield stream
-  except KeyboardInterrupt: pass
-  finally:
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-@contextmanager
-def log_writer():
-  try:
-    logs = []
-    yield logs
-  finally:
-    sep = "="*os.get_terminal_size()[1]
-    print(f"{sep[:-1]}\nCHAT LOG")
-    print(*logs, sep="\n")
-    print(sep)
-
-def listener(q: mp.Queue, event: mp.Event):
-  try:
-    p = pyaudio.PyAudio()
-    stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE, input=True, frames_per_buffer=CHUNK)
-    did_print = False
-    while True:
-      data = stream.read(CHUNK) # read data to avoid overflow
-      if event.is_set():
-        if not did_print:
-          print("listening")
-          did_print = True
-        q.put(((np.frombuffer(data, np.int16)/32768).astype(np.float32)*3))
-      else:
-        did_print = False
-  finally:
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-def mp_output_stream(q: mp.Queue, counter: mp.Value, num_channels: int, sample_rate: int):
-  with output_stream(num_channels, sample_rate) as stream:
-    while True:
-      try:
-        stream.write(q.get())
-        counter.value += 1
-      except KeyboardInterrupt:
-        break
-
-if __name__ == "__main__":
-  import nltk
-  nltk.download("punkt")
-  # Parse CLI arguments
-  parser = argparse.ArgumentParser("Have a tiny conversation with tinygrad")
-
-  # Whisper args
-  parser.add_argument("--whisper_model_name", type=str, default="tiny.en")
-
-  # LLAMA args
-  parser.add_argument("--llama_pre_prompt_path", type=Path, default=Path(__file__).parent / "conversation_data" / "pre_prompt_stacy.yaml", help="Path to yaml file which contains all pre-prompt data needed. ")
-  parser.add_argument("--llama_count", type=int, default=1000, help="Max number of tokens to generate")
-  parser.add_argument("--llama_temperature", type=float, default=0.7, help="Temperature in the softmax")
-  parser.add_argument("--llama_quantize", type=str, default=None, help="Quantize the weights to int8 or nf4 in memory")
-  parser.add_argument("--llama_model", type=Path, default=None, help="Folder with the original weights to load, or single .index.json, .safetensors or .bin file")
-  parser.add_argument("--llama_gen", type=str, default="tiny", required=False, help="Generation of the model to use")
-  parser.add_argument("--llama_size", type=str, default="1B-Chat", required=False, help="Size of model to use")
-  parser.add_argument("--llama_tokenizer", type=Path, default=None, required=False, help="Path to llama tokenizer.model")
-
-  # vits args
-  parser.add_argument("--vits_model_to_use", default="vctk", help="Specify the model to use. Default is 'vctk'.")
-  parser.add_argument("--vits_speaker_id", type=int, default=12, help="Specify the speaker ID. Default is 6.")
-  parser.add_argument("--vits_noise_scale", type=float, default=0.667, help="Specify the noise scale. Default is 0.667.")
-  parser.add_argument("--vits_noise_scale_w", type=float, default=0.8, help="Specify the noise scale w. Default is 0.8.")
-  parser.add_argument("--vits_length_scale", type=float, default=1, help="Specify the length scale. Default is 1.")
-  parser.add_argument("--vits_seed", type=int, default=None, help="Specify the seed (set to None if no seed). Default is 1337.")
-  parser.add_argument("--vits_num_channels", type=int, default=1, help="Specify the number of audio output channels. Default is 1.")
-  parser.add_argument("--vits_sample_width", type=int, default=2, help="Specify the number of bytes per sample, adjust if necessary. Default is 2.")
-  parser.add_argument("--vits_emotion_path", type=Path, default=None, help="Specify the path to emotion reference.")
-  parser.add_argument("--vits_estimate_max_y_length", type=str, default=False, help="If true, overestimate the output length and then trim it to the correct length, to prevent premature realization, much more performant for larger inputs, for smaller inputs not so much. Default is False.")
-  parser.add_argument("--vits_vocab_path", type=Path, default=None, help="Path to the TTS vocabulary.")
-
-  # conversation args
-  parser.add_argument("--max_sentence_length", type=int, default=20, help="Max words in one sentence to pass to vits")
-
-  args = parser.parse_args()
-
-  # Init models
-  model, enc = init_whisper(args.whisper_model_name)
-  synth, emotion_embedding, text_mapper, hps, model_has_multiple_speakers = init_vits(args.vits_model_to_use, args.vits_emotion_path, args.vits_speaker_id, args.vits_seed)
-
-  # Download tinyllama chat as a default model
-  if args.llama_model is None:
-    args.llama_model = fetch("https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/resolve/main/model.safetensors", "tinyllamachat.safetensors")
-    args.llama_gen = "tiny"
-    args.llama_size = "1B-Chat"
-  # Add 3 more tokens to the tokenizer
-  if args.llama_gen == "tiny" and args.llama_size.endswith("Chat"): args.llama_tokenizer = create_fixed_tokenizer()
-  tokenizer_path = args.llama_tokenizer or args.llama_model.parent / "tokenizer.model"
-  llama = LLaMa.build(args.llama_model, tokenizer_path, args.llama_gen, args.llama_size, args.llama_quantize)
-  toks, user_delim, resp_delim, start_pos, outputted = llama_prepare(llama, args.llama_temperature, args.llama_pre_prompt_path)
-
-  # Start child process for mic input
-  q = mp.Queue()
-  is_listening_event = mp.Event()
-  p = mp.Process(target=listener, args=(q, is_listening_event,))
-  p.daemon = True
-  p.start()
-
-  # Start child process for speaker output
-  out_q = mp.Queue()
-  out_counter = mp.Value("i", 0)
-  out_p = mp.Process(target=mp_output_stream, args=(out_q, out_counter, args.vits_num_channels, hps.data.sampling_rate,))
-  out_p.daemon = True
-  out_p.start()
-
-  # JIT tts
-  for i in ["Hello, I'm a chat bot", "I am capable of doing a lot of things"]:
-    tts(
-      i, synth, hps, emotion_embedding,
-      args.vits_speaker_id, args.vits_model_to_use, args.vits_noise_scale,
-      args.vits_noise_scale_w, args.vits_length_scale,
-      args.vits_estimate_max_y_length, text_mapper, model_has_multiple_speakers
-    )
-
-  # Start the pipeline
-  with log_writer() as log:
-    while True:
-      tokens = [enc._special_tokens["<|startoftranscript|>"], enc._special_tokens["<|notimestamps|>"]]
-      total = np.array([])
-      out_counter.value = 0
-
-      s = time.perf_counter()
-      is_listening_event.set()
-      prev_text = None
-      while True:
-        for _ in range(RATE // CHUNK): total = np.concatenate([total, q.get()])
-        txt = transcribe_waveform(model, enc, [total], truncate=True)
-        print(txt, end="\r")
-        if txt == "[BLANK_AUDIO]" or re.match(r"^\([\w+ ]+\)$", txt.strip()): continue
-        if prev_text is not None and prev_text == txt:
-          is_listening_event.clear()
-          break
-        prev_text = txt
-      print() # to avoid llama printing on the same line
-      log.append(f"{user_delim.capitalize()}: {txt}")
-
-      # Generate with llama
-      with Timing("llama generation: "):
-        outputted, start_pos, response = llama_generate(
-          llama, toks, outputted, txt, start_pos,
-          user_delim=user_delim, resp_delim=resp_delim, temperature=args.llama_temperature,
-          max_tokens=args.llama_count
-        )
-        log.append(f"{resp_delim.capitalize()}: {response}")
-
-      # Convert to voice
-      with Timing("tts: "):
-        sentences = nltk.sent_tokenize(response.replace('"', ""))
-        for i in sentences:
-          total = np.array([], dtype=np.int16)
-          for j in chunks(i.split(), args.max_sentence_length):
-            audio_data = tts(
-              " ".join(j), synth, hps, emotion_embedding,
-              args.vits_speaker_id, args.vits_model_to_use, args.vits_noise_scale,
-              args.vits_noise_scale_w, args.vits_length_scale,
-              args.vits_estimate_max_y_length, text_mapper, model_has_multiple_speakers
-            )
-            total = np.concatenate([total, audio_data])
-          out_q.put(total.tobytes())
-      while out_counter.value < len(sentences): continue
-      log.append(f"Total: {time.perf_counter() - s}")
--- a/examples/mlperf/model_eval.py
+++ b/examples/mlperf/model_eval.py
@@ -204,43 +204,6 @@ def eval_bert():

    st = time.perf_counter()

-def eval_mrcnn():
-  from tqdm import tqdm
-  from extra.models.mask_rcnn import MaskRCNN
-  from extra.models.resnet import ResNet
-  from extra.datasets.coco import BASEDIR, images, convert_prediction_to_coco_bbox, convert_prediction_to_coco_mask, accumulate_predictions_for_coco, evaluate_predictions_on_coco, iterate
-  from examples.mask_rcnn import compute_prediction_batched, Image
-  mdl = MaskRCNN(ResNet(50, num_classes=None, stride_in_1x1=True))
-  mdl.load_from_pretrained()
-
-  bbox_output = '/tmp/results_bbox.json'
-  mask_output = '/tmp/results_mask.json'
-
-  accumulate_predictions_for_coco([], bbox_output, rm=True)
-  accumulate_predictions_for_coco([], mask_output, rm=True)
-
-  #TODO: bs > 1 not as accurate
-  bs = 1
-
-  for batch in tqdm(iterate(images, bs=bs), total=len(images)//bs):
-    batch_imgs = []
-    for image_row in batch:
-      image_name = image_row['file_name']
-      img = Image.open(BASEDIR/f'val2017/{image_name}').convert("RGB")
-      batch_imgs.append(img)
-    batch_result = compute_prediction_batched(batch_imgs, mdl)
-    for image_row, result in zip(batch, batch_result):
-      image_name = image_row['file_name']
-      box_pred = convert_prediction_to_coco_bbox(image_name, result)
-      mask_pred = convert_prediction_to_coco_mask(image_name, result)
-      accumulate_predictions_for_coco(box_pred, bbox_output)
-      accumulate_predictions_for_coco(mask_pred, mask_output)
-    del batch_imgs
-    del batch_result
-
-  evaluate_predictions_on_coco(bbox_output, iou_type='bbox')
-  evaluate_predictions_on_coco(mask_output, iou_type='segm')
-
 def eval_llama3():
  from extra.models.llama import Transformer
  from examples.llama3 import MODEL_PARAMS, load, convert_from_huggingface
@@ -541,7 +504,7 @@ if __name__ == "__main__":
  # inference only
  Tensor.training = False

-  models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",")
+  models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert").split(",")
  for m in models:
    nm = f"eval_{m}"
    if nm in globals():
--- a/examples/vits.py
+++ b/examples/vits.py
@@ -1,740 +0,0 @@
-import json, logging, math, re, sys, time, wave, argparse, numpy as np
-from phonemizer.phonemize import default_separator, _phonemize
-from phonemizer.backend import EspeakBackend
-from phonemizer.punctuation import Punctuation
-from functools import reduce
-from pathlib import Path
-from typing import List
-from tinygrad import nn, dtypes
-from tinygrad.helpers import fetch
-from tinygrad.nn.state import torch_load
-from tinygrad.tensor import Tensor
-from tinygrad.engine.jit import TinyJit
-from unidecode import unidecode
-
-LRELU_SLOPE = 0.1
-
-class Synthesizer:
-  def __init__(self, n_vocab, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, n_speakers=0, gin_channels=0, use_sdp=True, emotion_embedding=False, **kwargs):
-    self.n_vocab, self.spec_channels, self.inter_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.segment_size, self.n_speakers, self.gin_channels, self.use_sdp = n_vocab, spec_channels, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, segment_size, n_speakers, gin_channels, use_sdp
-    self.enc_p = TextEncoder(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, emotion_embedding)
-    self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
-    self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
-    self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) if use_sdp else DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
-    if n_speakers > 1: self.emb_g = nn.Embedding(n_speakers, gin_channels)
-  def infer(self, x, x_lengths, sid=None, noise_scale=1.0, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None, max_y_length_estimate_scale=None, pad_length=-1):
-    x, m_p, logs_p, x_mask = self.enc_p.forward(x.realize(), x_lengths.realize(), emotion_embedding.realize() if emotion_embedding is not None else emotion_embedding)
-    g = self.emb_g(sid.reshape(1, 1)).squeeze(1).unsqueeze(-1) if self.n_speakers > 0 else None
-    logw = self.dp.forward(x, x_mask.realize(), g=g.realize(), reverse=self.use_sdp, noise_scale=noise_scale_w if self.use_sdp else 1.0)
-    w_ceil = Tensor.ceil(logw.exp() * x_mask * length_scale)
-    y_lengths = Tensor.maximum(w_ceil.sum([1, 2]), 1).cast(dtypes.int64)
-    return self.generate(g, logs_p, m_p, max_len, max_y_length_estimate_scale, noise_scale, w_ceil, x, x_mask, y_lengths, pad_length)
-  def generate(self, g, logs_p, m_p, max_len, max_y_length_estimate_scale, noise_scale, w_ceil, x, x_mask, y_lengths, pad_length):
-    max_y_length = y_lengths.max().item() if max_y_length_estimate_scale is None else max(15, x.shape[-1]) * max_y_length_estimate_scale
-    y_mask = sequence_mask(y_lengths, max_y_length).unsqueeze(1).cast(x_mask.dtype)
-    attn_mask = x_mask.unsqueeze(2) * y_mask.unsqueeze(-1)
-    attn = generate_path(w_ceil, attn_mask)
-    m_p_2 = attn.squeeze(1).matmul(m_p.transpose(1, 2)).transpose(1, 2)        # [b, t', t], [b, t, d] -> [b, d, t']
-    logs_p_2 = attn.squeeze(1).matmul(logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
-    z_p = m_p_2 + Tensor.randn(*m_p_2.shape, dtype=m_p_2.dtype) * logs_p_2.exp() * noise_scale
-    row_len = y_mask.shape[2]
-    if pad_length > -1:
-      # Pad flow forward inputs to enable JIT
-      assert pad_length > row_len, "pad length is too small"
-      y_mask = y_mask.pad(((0, 0), (0, 0), (0, pad_length - row_len))).cast(z_p.dtype)
-      # New y_mask tensor to remove sts mask
-      y_mask = Tensor(y_mask.numpy(), device=y_mask.device, dtype=y_mask.dtype, requires_grad=y_mask.requires_grad)
-      z_p = z_p.squeeze(0).pad(((0, 0), (0, pad_length - z_p.shape[2])), value=1).unsqueeze(0)
-    z = self.flow.forward(z_p.realize(), y_mask.realize(), g=g.realize(), reverse=True)
-    result_length = reduce(lambda x, y: x * y, self.dec.upsample_rates, row_len)
-    o = self.dec.forward((z * y_mask)[:, :, :max_len], g=g)[:, :, :result_length]
-    if max_y_length_estimate_scale is not None:
-      length_scaler = o.shape[-1] / max_y_length
-      o.realize()
-      real_max_y_length = y_lengths.max().numpy()
-      if real_max_y_length > max_y_length:
-        logging.warning(f"Underestimated max length by {(((real_max_y_length / max_y_length) * 100) - 100):.2f}%, recomputing inference without estimate...")
-        return self.generate(g, logs_p, m_p, max_len, None, noise_scale, w_ceil, x, x_mask, y_lengths)
-      if real_max_y_length < max_y_length:
-        overestimation = ((max_y_length / real_max_y_length) * 100) - 100
-        logging.info(f"Overestimated max length by {overestimation:.2f}%")
-        if overestimation > 10: logging.warning("Warning: max length overestimated by more than 10%")
-      o = o[:, :, :(real_max_y_length * length_scaler).astype(np.int32)]
-    return o
-
-class StochasticDurationPredictor:
-  def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
-    filter_channels = in_channels # it needs to be removed from future version.
-    self.in_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.n_flows, self.gin_channels = in_channels, filter_channels, kernel_size, p_dropout, n_flows, gin_channels
-    self.log_flow, self.flows = Log(), [ElementwiseAffine(2)]
-    for _ in range(n_flows):
-      self.flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
-      self.flows.append(Flip())
-    self.post_pre, self.post_proj = nn.Conv1d(1, filter_channels, 1), nn.Conv1d(filter_channels, filter_channels, 1)
-    self.post_convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
-    self.post_flows = [ElementwiseAffine(2)]
-    for _ in range(4):
-      self.post_flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
-      self.post_flows.append(Flip())
-    self.pre, self.proj = nn.Conv1d(in_channels, filter_channels, 1), nn.Conv1d(filter_channels, filter_channels, 1)
-    self.convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
-  @TinyJit
-  def forward(self, x: Tensor, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
-    x = self.pre(x.detach())
-    if g is not None: x = x + self.cond(g.detach())
-    x = self.convs.forward(x, x_mask)
-    x = self.proj(x) * x_mask
-    if not reverse:
-      flows = self.flows
-      assert w is not None
-      log_det_tot_q = 0
-      h_w = self.post_proj(self.post_convs.forward(self.post_pre(w), x_mask)) * x_mask
-      e_q = Tensor.randn(w.size(0), 2, w.size(2), dtype=x.dtype).to(device=x.device) * x_mask
-      z_q = e_q
-      for flow in self.post_flows:
-        z_q, log_det_q = flow.forward(z_q, x_mask, g=(x + h_w))
-        log_det_tot_q += log_det_q
-      z_u, z1 = z_q.split([1, 1], 1)
-      u = z_u.sigmoid() * x_mask
-      z0 = (w - u) * x_mask
-      log_det_tot_q += Tensor.sum((z_u.logsigmoid() + (-z_u).logsigmoid()) * x_mask, [1,2])
-      log_q = Tensor.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - log_det_tot_q
-      log_det_tot = 0
-      z0, log_det = self.log_flow.forward(z0, x_mask)
-      log_det_tot += log_det
-      z = z0.cat(z1, 1)
-      for flow in flows:
-        z, log_det = flow.forward(z, x_mask, g=x, reverse=reverse)
-        log_det_tot = log_det_tot + log_det
-      nll = Tensor.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - log_det_tot
-      return (nll + log_q).realize() # [b]
-    flows = list(reversed(self.flows))
-    flows = flows[:-2] + [flows[-1]] # remove a useless vflow
-    z = Tensor.randn(x.shape[0], 2, x.shape[2], dtype=x.dtype).to(device=x.device) * noise_scale
-    for flow in flows: z = flow.forward(z, x_mask, g=x, reverse=reverse)
-    z0, z1 = z.split([1, 1], 1)
-    return z0.realize()
-
-class DurationPredictor:
-  def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
-    self.in_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.gin_channels = in_channels, filter_channels, kernel_size, p_dropout, gin_channels
-    self.conv_1, self.norm_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2), LayerNorm(filter_channels)
-    self.conv_2, self.norm_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2), LayerNorm(filter_channels)
-    self.proj = nn.Conv1d(filter_channels, 1, 1)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, in_channels, 1)
-  def forward(self, x: Tensor, x_mask, g=None):
-    x = x.detach()
-    if g is not None: x = x + self.cond(g.detach())
-    x = self.conv_1(x * x_mask).relu()
-    x = self.norm_1(x).dropout(self.p_dropout)
-    x = self.conv_2(x * x_mask).relu(x)
-    x = self.norm_2(x).dropout(self.p_dropout)
-    return self.proj(x * x_mask) * x_mask
-
-class TextEncoder:
-  def __init__(self, n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, emotion_embedding):
-    self.n_vocab, self.out_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout = n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-    if n_vocab!=0:self.emb = nn.Embedding(n_vocab, hidden_channels)
-    if emotion_embedding: self.emo_proj = nn.Linear(1024, hidden_channels)
-    self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
-    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-  @TinyJit
-  def forward(self, x: Tensor, x_lengths: Tensor, emotion_embedding=None):
-    if self.n_vocab!=0: x = (self.emb(x) * math.sqrt(self.hidden_channels))
-    if emotion_embedding: x = x + self.emo_proj(emotion_embedding).unsqueeze(1)
-    x = x.transpose(1, -1)  # [b, t, h] -transpose-> [b, h, t]
-    x_mask = sequence_mask(x_lengths, x.shape[2]).unsqueeze(1).cast(x.dtype)
-    x = self.encoder.forward(x * x_mask, x_mask)
-    m, logs = (self.proj(x) * x_mask).split(self.out_channels, dim=1)
-    return x.realize(), m.realize(), logs.realize(), x_mask.realize()
-
-class ResidualCouplingBlock:
-  def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
-    self.channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.n_flows, self.gin_channels = channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows, gin_channels
-    self.flows = []
-    for _ in range(n_flows):
-      self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
-      self.flows.append(Flip())
-  @TinyJit
-  def forward(self, x, x_mask, g=None, reverse=False):
-    for flow in reversed(self.flows) if reverse else self.flows: x = flow.forward(x, x_mask, g=g, reverse=reverse)
-    return x.realize()
-
-class PosteriorEncoder:
-  def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
-    self.in_channels, self.out_channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.gin_channels = in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels
-    self.pre, self.proj = nn.Conv1d(in_channels, hidden_channels, 1), nn.Conv1d(hidden_channels, out_channels * 2, 1)
-    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-  def forward(self, x, x_lengths, g=None):
-    x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).cast(x.dtype)
-    stats = self.proj(self.enc.forward(self.pre(x) * x_mask, x_mask, g=g)) * x_mask
-    m, logs = stats.split(self.out_channels, dim=1)
-    z = (m + Tensor.randn(m.shape, m.dtype) * logs.exp()) * x_mask
-    return z, m, logs, x_mask
-
-class Generator:
-  def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
-    self.num_kernels, self.num_upsamples = len(resblock_kernel_sizes), len(upsample_rates)
-    self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
-    resblock = ResBlock1 if resblock == '1' else ResBlock2
-    self.ups = [nn.ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), k, u, padding=(k-u)//2) for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes))]
-    self.resblocks = []
-    self.upsample_rates = upsample_rates
-    for i in range(len(self.ups)):
-      ch = upsample_initial_channel // (2 ** (i + 1))
-      for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-        self.resblocks.append(resblock(ch, k, d))
-    self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-  @TinyJit
-  def forward(self, x: Tensor, g=None):
-    x = self.conv_pre(x)
-    if g is not None:  x = x + self.cond(g)
-    for i in range(self.num_upsamples):
-      x = self.ups[i](x.leaky_relu(LRELU_SLOPE))
-      xs = sum(self.resblocks[i * self.num_kernels + j].forward(x) for j in range(self.num_kernels))
-      x = (xs / self.num_kernels).realize()
-    res = self.conv_post(x.leaky_relu()).tanh().realize()
-    return res
-
-class LayerNorm(nn.LayerNorm):
-  def __init__(self, channels, eps=1e-5): super().__init__(channels, eps, elementwise_affine=True)
-  def forward(self, x: Tensor): return self.__call__(x.transpose(1, -1)).transpose(1, -1)
-
-class WN:
-  def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
-    assert (kernel_size % 2 == 1)
-    self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.gin_channels, self.p_dropout = hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels, p_dropout
-    self.in_layers, self.res_skip_layers = [], []
-    if gin_channels != 0: self.cond_layer = nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
-    for i in range(n_layers):
-      dilation = dilation_rate ** i
-      self.in_layers.append(nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=int((kernel_size * dilation - dilation) / 2)))
-      self.res_skip_layers.append(nn.Conv1d(hidden_channels, 2 * hidden_channels if i < n_layers - 1 else hidden_channels, 1))
-  def forward(self, x, x_mask, g=None, **kwargs):
-    output = Tensor.zeros_like(x)
-    if g is not None: g = self.cond_layer(g)
-    for i in range(self.n_layers):
-      x_in = self.in_layers[i](x)
-      if g is not None:
-        cond_offset = i * 2 * self.hidden_channels
-        g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
-      else:
-        g_l = Tensor.zeros_like(x_in)
-      acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.hidden_channels)
-      res_skip_acts = self.res_skip_layers[i](acts)
-      if i < self.n_layers - 1:
-        x = (x + res_skip_acts[:, :self.hidden_channels, :]) * x_mask
-        output = output + res_skip_acts[:, self.hidden_channels:, :]
-      else:
-        output = output + res_skip_acts
-    return output * x_mask
-
-class ResBlock1:
-  def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-    self.convs1 = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[i], padding=get_padding(kernel_size, dilation[i])) for i in range(3)]
-    self.convs2 = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)) for _ in range(3)]
-  def forward(self, x: Tensor, x_mask=None):
-    for c1, c2 in zip(self.convs1, self.convs2):
-      xt = x.leaky_relu(LRELU_SLOPE)
-      xt = c1(xt if x_mask is None else xt * x_mask).leaky_relu(LRELU_SLOPE)
-      x = c2(xt if x_mask is None else xt * x_mask) + x
-    return x if x_mask is None else x * x_mask
-
-class ResBlock2:
-  def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
-    self.convs = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[i], padding=get_padding(kernel_size, dilation[i])) for i in range(2)]
-  def forward(self, x, x_mask=None):
-    for c in self.convs:
-      xt = x.leaky_relu(LRELU_SLOPE)
-      xt = c(xt if x_mask is None else xt * x_mask)
-      x = xt + x
-    return x if x_mask is None else x * x_mask
-
-class DDSConv: # Dilated and Depth-Separable Convolution
-  def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
-    self.channels, self.kernel_size, self.n_layers, self.p_dropout = channels, kernel_size, n_layers, p_dropout
-    self.convs_sep, self.convs_1x1, self.norms_1, self.norms_2 = [], [], [], []
-    for i in range(n_layers):
-      dilation = kernel_size ** i
-      padding = (kernel_size * dilation - dilation) // 2
-      self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, groups=channels, dilation=dilation, padding=padding))
-      self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
-      self.norms_1.append(LayerNorm(channels))
-      self.norms_2.append(LayerNorm(channels))
-  def forward(self, x, x_mask, g=None):
-    if g is not None: x = x + g
-    for i in range(self.n_layers):
-      y = self.convs_sep[i](x * x_mask)
-      y = self.norms_1[i].forward(y).gelu()
-      y = self.convs_1x1[i](y)
-      y = self.norms_2[i].forward(y).gelu()
-      x = x + y.dropout(self.p_dropout)
-    return x * x_mask
-
-class ConvFlow:
-  def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
-    self.in_channels, self.filter_channels, self.kernel_size, self.n_layers, self.num_bins, self.tail_bound = in_channels, filter_channels, kernel_size, n_layers, num_bins, tail_bound
-    self.half_channels = in_channels // 2
-    self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
-    self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
-    self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
-  def forward(self, x, x_mask, g=None, reverse=False):
-    x0, x1 = x.split([self.half_channels] * 2, 1)
-    h = self.proj(self.convs.forward(self.pre(x0), x_mask, g=g)) * x_mask
-    b, c, t = x0.shape
-    h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
-    un_normalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
-    un_normalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
-    un_normalized_derivatives = h[..., 2 * self.num_bins:]
-    x1, log_abs_det = piecewise_rational_quadratic_transform(x1, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=reverse, tails='linear', tail_bound=self.tail_bound)
-    x = x0.cat(x1, dim=1) * x_mask
-    return x if reverse else (x, Tensor.sum(log_abs_det * x_mask, [1,2]))
-
-class ResidualCouplingLayer:
-  def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
-    assert channels % 2 == 0, "channels should be divisible by 2"
-    self.channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.mean_only = channels, hidden_channels, kernel_size, dilation_rate, n_layers, mean_only
-    self.half_channels = channels // 2
-    self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
-    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
-    self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
-  def forward(self, x, x_mask, g=None, reverse=False):
-    x0, x1 = x.split([self.half_channels] * 2, 1)
-    stats = self.post(self.enc.forward(self.pre(x0) * x_mask, x_mask, g=g)) * x_mask
-    if not self.mean_only:
-      m, logs = stats.split([self.half_channels] * 2, 1)
-    else:
-      m = stats
-      logs = Tensor.zeros_like(m)
-    if not reverse: return x0.cat((m + x1 * logs.exp() * x_mask), dim=1)
-    return x0.cat(((x1 - m) * (-logs).exp() * x_mask), dim=1)
-
-class Log:
-  def forward(self, x : Tensor, x_mask, reverse=False):
-    if not reverse:
-      y = x.maximum(1e-5).log() * x_mask
-      return y, (-y).sum([1, 2])
-    return x.exp() * x_mask
-
-class Flip:
-  def forward(self, x: Tensor, *args, reverse=False, **kwargs):
-    return x.flip([1]) if reverse else (x.flip([1]), Tensor.zeros(x.shape[0], dtype=x.dtype).to(device=x.device))
-
-class ElementwiseAffine:
-  def __init__(self, channels): self.m, self.logs = Tensor.zeros(channels, 1), Tensor.zeros(channels, 1)
-  def forward(self, x, x_mask, reverse=False, **kwargs): # x if reverse else y, logdet
-    return (x - self.m) * Tensor.exp(-self.logs) * x_mask if reverse \
-      else ((self.m + Tensor.exp(self.logs) * x) * x_mask, Tensor.sum(self.logs * x_mask, [1, 2]))
-
-class MultiHeadAttention:
-  def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
-    assert channels % n_heads == 0
-    self.channels, self.out_channels, self.n_heads, self.p_dropout, self.window_size, self.heads_share, self.block_length, self.proximal_bias, self.proximal_init = channels, out_channels, n_heads, p_dropout, window_size, heads_share, block_length, proximal_bias, proximal_init
-    self.attn, self.k_channels  = None, channels // n_heads
-    self.conv_q, self.conv_k, self.conv_v = [nn.Conv1d(channels, channels, 1) for _ in range(3)]
-    self.conv_o = nn.Conv1d(channels, out_channels, 1)
-    if window_size is not None: self.emb_rel_k, self.emb_rel_v = [Tensor.randn(1 if heads_share else n_heads, window_size * 2 + 1, self.k_channels) * (self.k_channels ** -0.5) for _ in range(2)]
-  def forward(self, x, c, attn_mask=None):
-    q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)
-    x, self.attn = self.attention(q, k, v, mask=attn_mask)
-    return self.conv_o(x)
-  def attention(self, query: Tensor, key: Tensor, value: Tensor, mask=None):# reshape [b, d, t] -> [b, n_h, t, d_k]
-    b, d, t_s, t_t = key.shape[0], key.shape[1], key.shape[2], query.shape[2]
-    query = query.reshape(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
-    key = key.reshape(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-    value = value.reshape(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-    scores = (query / math.sqrt(self.k_channels)) @ key.transpose(-2, -1)
-    if self.window_size is not None:
-      assert t_s == t_t, "Relative attention is only available for self-attention."
-      key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
-      rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
-      scores = scores + self._relative_position_to_absolute_position(rel_logits)
-    if mask is not None:
-      scores = Tensor.where(mask, scores, -1e4)
-      if self.block_length is not None:
-        assert t_s == t_t, "Local attention is only available for self-attention."
-        scores = Tensor.where(Tensor.ones_like(scores).triu(-self.block_length).tril(self.block_length), scores, -1e4)
-    p_attn = scores.softmax(axis=-1)  # [b, n_h, t_t, t_s]
-    output = p_attn.matmul(value)
-    if self.window_size is not None:
-      relative_weights = self._absolute_position_to_relative_position(p_attn)
-      value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
-      output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
-    output = output.transpose(2, 3).contiguous().reshape(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
-    return output, p_attn
-  def _matmul_with_relative_values(self, x, y): return x.matmul(y.unsqueeze(0))                 # x: [b, h, l, m], y: [h or 1, m, d], ret: [b, h, l, d]
-  def _matmul_with_relative_keys(self, x, y): return x.matmul(y.unsqueeze(0).transpose(-2, -1)) # x: [b, h, l, d], y: [h or 1, m, d], re, : [b, h, l, m]
-  def _get_relative_embeddings(self, relative_embeddings, length):
-    pad_length, slice_start_position = max(length - (self.window_size + 1), 0), max((self.window_size + 1) - length, 0)
-    padded_relative_embeddings = relative_embeddings if pad_length <= 0\
-      else relative_embeddings.pad(convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
-    return padded_relative_embeddings[:, slice_start_position:(slice_start_position + 2 * length - 1)] #used_relative_embeddings
-  def _relative_position_to_absolute_position(self, x: Tensor): # x: [b, h, l, 2*l-1] -> [b, h, l, l]
-    batch, heads, length, _ = x.shape
-    x = x.pad(convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
-    x_flat = x.reshape([batch, heads, length * 2 * length]).pad(convert_pad_shape([[0,0],[0,0],[0,length-1]]))
-    return x_flat.reshape([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
-  def _absolute_position_to_relative_position(self, x: Tensor): # x: [b, h, l, l] -> [b, h, l, 2*l-1]
-    batch, heads, length, _ = x.shape
-    x = x.pad(convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
-    x_flat = x.reshape([batch, heads, length**2 + length*(length -1)]).pad(convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
-    return x_flat.reshape([batch, heads, length, 2*length])[:,:,:,1:]
-
-class FFN:
-  def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
-    self.in_channels, self.out_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.activation, self.causal = in_channels, out_channels, filter_channels, kernel_size, p_dropout, activation, causal
-    self.padding = self._causal_padding if causal else self._same_padding
-    self.conv_1, self.conv_2 = nn.Conv1d(in_channels, filter_channels, kernel_size), nn.Conv1d(filter_channels, out_channels, kernel_size)
-  def forward(self, x, x_mask):
-    x = self.conv_1(self.padding(x * x_mask))
-    x = x * (1.702 * x).sigmoid() if self.activation == "gelu" else x.relu()
-    return self.conv_2(self.padding(x.dropout(self.p_dropout) * x_mask)) * x_mask
-  def _causal_padding(self, x):return x if self.kernel_size == 1 else x.pad(convert_pad_shape([[0, 0], [0, 0], [self.kernel_size - 1, 0]]))
-  def _same_padding(self, x): return x if self.kernel_size == 1 else x.pad(convert_pad_shape([[0, 0], [0, 0], [(self.kernel_size - 1) // 2, self.kernel_size // 2]]))
-
-class Encoder:
-  def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
-    self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout, self.window_size = hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, window_size
-    self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 = [], [], [], []
-    for _ in range(n_layers):
-      self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
-      self.norm_layers_1.append(LayerNorm(hidden_channels))
-      self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
-      self.norm_layers_2.append(LayerNorm(hidden_channels))
-  def forward(self, x, x_mask):
-    attn_mask, x = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1), x * x_mask
-    for i in range(self.n_layers):
-      y = self.attn_layers[i].forward(x, x, attn_mask).dropout(self.p_dropout)
-      x = self.norm_layers_1[i].forward(x + y)
-      y = self.ffn_layers[i].forward(x, x_mask).dropout(self.p_dropout)
-      x = self.norm_layers_2[i].forward(x + y)
-    return x * x_mask
-
-DEFAULT_MIN_BIN_WIDTH, DEFAULT_MIN_BIN_HEIGHT, DEFAULT_MIN_DERIVATIVE = 1e-3, 1e-3, 1e-3
-def piecewise_rational_quadratic_transform(inputs, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=False, tails=None, tail_bound=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
-  if tails is None: spline_fn, spline_kwargs = rational_quadratic_spline, {}
-  else: spline_fn, spline_kwargs = unconstrained_rational_quadratic_spline, {'tails': tails, 'tail_bound': tail_bound}
-  return spline_fn(inputs=inputs, un_normalized_widths=un_normalized_widths, un_normalized_heights=un_normalized_heights, un_normalized_derivatives=un_normalized_derivatives, inverse=inverse, min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative, **spline_kwargs)
-def unconstrained_rational_quadratic_spline(inputs, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=False, tails='linear', tail_bound=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
-  if not tails == 'linear': raise RuntimeError('{} tails are not implemented.'.format(tails))
-  constant = np.log(np.exp(1 - min_derivative) - 1).item()
-  un_normalized_derivatives = cat_lr(un_normalized_derivatives, constant, constant)
-  output, log_abs_det = rational_quadratic_spline(inputs=inputs.squeeze(dim=0).squeeze(dim=0), unnormalized_widths=un_normalized_widths.squeeze(dim=0).squeeze(dim=0), unnormalized_heights=un_normalized_heights.squeeze(dim=0).squeeze(dim=0), unnormalized_derivatives=un_normalized_derivatives.squeeze(dim=0).squeeze(dim=0), inverse=inverse, left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative)
-  return output.unsqueeze(dim=0).unsqueeze(dim=0), log_abs_det.unsqueeze(dim=0).unsqueeze(dim=0)
-def rational_quadratic_spline(inputs: Tensor, unnormalized_widths: Tensor, unnormalized_heights: Tensor, unnormalized_derivatives: Tensor, inverse=False, left=0., right=1., bottom=0., top=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
-  num_bins = unnormalized_widths.shape[-1]
-  if min_bin_width * num_bins > 1.0: raise ValueError('Minimal bin width too large for the number of bins')
-  if min_bin_height * num_bins > 1.0: raise ValueError('Minimal bin height too large for the number of bins')
-  widths = min_bin_width + (1 - min_bin_width * num_bins) * unnormalized_widths.softmax(axis=-1)
-  cum_widths = cat_lr(((right - left) * widths[..., :-1].cumsum(axis=1) + left), left, right + 1e-6 if not inverse else right)
-  widths = cum_widths[..., 1:] - cum_widths[..., :-1]
-  derivatives = min_derivative + (unnormalized_derivatives.exp()+1).log()
-  heights = min_bin_height + (1 - min_bin_height * num_bins) * unnormalized_heights.softmax(axis=-1)
-  cum_heights = cat_lr(((top - bottom) * heights[..., :-1].cumsum(axis=1) + bottom), bottom, top + 1e-6 if inverse else top)
-  heights = cum_heights[..., 1:] - cum_heights[..., :-1]
-  bin_idx = ((inputs[..., None] >= (cum_heights if inverse else cum_widths)).sum(axis=-1) - 1)[..., None]
-  input_cum_widths = gather(cum_widths, bin_idx, axis=-1)[..., 0]
-  input_bin_widths = gather(widths, bin_idx, axis=-1)[..., 0]
-  input_cum_heights = gather(cum_heights, bin_idx, axis=-1)[..., 0]
-  input_delta = gather(heights / widths, bin_idx, axis=-1)[..., 0]
-  input_derivatives = gather(derivatives, bin_idx, axis=-1)[..., 0]
-  input_derivatives_plus_one = gather(derivatives[..., 1:], bin_idx, axis=-1)[..., 0]
-  input_heights = gather(heights, bin_idx, axis=-1)[..., 0]
-  if inverse:
-    a = ((inputs - input_cum_heights) * (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + input_heights * (input_delta - input_derivatives))
-    b = (input_heights * input_derivatives - (inputs - input_cum_heights) * (input_derivatives + input_derivatives_plus_one - 2 * input_delta))
-    c = - input_delta * (inputs - input_cum_heights)
-    discriminant = b.square() - 4 * a * c
-    # assert (discriminant.numpy() >= 0).all()
-    root = (2 * c) / (-b - discriminant.sqrt())
-    theta_one_minus_theta = root * (1 - root)
-    denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta)
-    derivative_numerator = input_delta.square() * (input_derivatives_plus_one * root.square() + 2 * input_delta * theta_one_minus_theta + input_derivatives * (1 - root).square())
-    return root * input_bin_widths + input_cum_widths, -(derivative_numerator.log() - 2 * denominator.log())
-  theta = (inputs - input_cum_widths) / input_bin_widths
-  theta_one_minus_theta = theta * (1 - theta)
-  numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta)
-  denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta)
-  derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) + 2 * input_delta * theta_one_minus_theta + input_derivatives * (1 - theta).pow(2))
-  return input_cum_heights + numerator / denominator, derivative_numerator.log() - 2 * denominator.log()
-
-def sequence_mask(length: Tensor, max_length): return Tensor.arange(max_length, dtype=length.dtype, device=length.device).unsqueeze(0) < length.unsqueeze(1)
-def generate_path(duration: Tensor, mask: Tensor):  # duration: [b, 1, t_x], mask: [b, 1, t_y, t_x]
-  b, _, t_y, t_x = mask.shape
-  path = sequence_mask(duration.cumsum(axis=2).reshape(b * t_x), t_y).cast(mask.dtype).reshape(b, t_x, t_y)
-  path = path - path.pad(convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
-  return path.unsqueeze(1).transpose(2, 3) * mask
-def fused_add_tanh_sigmoid_multiply(input_a: Tensor, input_b: Tensor, n_channels: int):
-  n_channels_int, in_act = n_channels, input_a + input_b
-  t_act, s_act = in_act[:, :n_channels_int, :].tanh(), in_act[:, n_channels_int:, :].sigmoid()
-  return t_act * s_act
-
-def cat_lr(t, left, right): return Tensor.full(get_shape(t), left).cat(t, dim=-1).cat(Tensor.full(get_shape(t), right), dim=-1)
-def get_shape(tensor):
-  (shape := list(tensor.shape))[-1] = 1
-  return tuple(shape)
-def convert_pad_shape(pad_shape): return tuple(tuple(x) for x in pad_shape)
-def get_padding(kernel_size, dilation=1): return int((kernel_size*dilation - dilation)/2)
-
-def gather(x, indices, axis):
-  indices = (indices < 0).where(indices + x.shape[axis], indices).transpose(0, axis)
-  permute_args = list(range(x.ndim))
-  permute_args[0], permute_args[axis] = permute_args[axis], permute_args[0]
-  permute_args.append(permute_args.pop(0))
-  x = x.permute(*permute_args)
-  reshape_arg = [1] * x.ndim + [x.shape[-1]]
-  return ((indices.unsqueeze(indices.ndim).expand(*indices.shape, x.shape[-1]) ==
-           Tensor.arange(x.shape[-1]).reshape(*reshape_arg).expand(*indices.shape, x.shape[-1])) * x).sum(indices.ndim).transpose(0, axis)
-
-def norm_except_dim(v, dim):
-  if dim == -1: return np.linalg.norm(v)
-  if dim == 0:
-    (output_shape := [1] * v.ndim)[0] = v.shape[0]
-    return np.linalg.norm(v.reshape(v.shape[0], -1), axis=1).reshape(output_shape)
-  if dim == v.ndim - 1:
-    (output_shape := [1] * v.ndim)[-1] = v.shape[-1]
-    return np.linalg.norm(v.reshape(-1, v.shape[-1]), axis=0).reshape(output_shape)
-  transposed_v = np.transpose(v, (dim,) + tuple(i for i in range(v.ndim) if i != dim))
-  return np.transpose(norm_except_dim(transposed_v, 0), (dim,) + tuple(i for i in range(v.ndim) if i != dim))
-def weight_norm(v: Tensor, g: Tensor, dim):
-  v, g = v.numpy(), g.numpy()
-  return Tensor(v * (g / norm_except_dim(v, dim)))
-
-# HPARAMS LOADING
-def get_hparams_from_file(path):
-  with open(path, "r") as f:
-    data = f.read()
-  return HParams(**json.loads(data))
-class HParams:
-  def __init__(self, **kwargs):
-    for k, v in kwargs.items(): self[k] = v if type(v) != dict else HParams(**v)
-  def keys(self): return self.__dict__.keys()
-  def items(self): return self.__dict__.items()
-  def values(self): return self.__dict__.values()
-  def __len__(self): return len(self.__dict__)
-  def __getitem__(self, key): return getattr(self, key)
-  def __setitem__(self, key, value): return setattr(self, key, value)
-  def __contains__(self, key): return key in self.__dict__
-  def __repr__(self): return self.__dict__.__repr__()
-
-# MODEL LOADING
-def load_model(symbols, hps, model) -> Synthesizer:
-  net_g = Synthesizer(len(symbols), hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers = hps.data.n_speakers, **hps.model)
-  _ = load_checkpoint(fetch(model[1]), net_g, None)
-  return net_g
-def load_checkpoint(checkpoint_path, model: Synthesizer, optimizer=None, skip_list=[]):
-  assert Path(checkpoint_path).is_file()
-  start_time = time.time()
-  checkpoint_dict = torch_load(checkpoint_path)
-  iteration, learning_rate = checkpoint_dict['iteration'], checkpoint_dict['learning_rate']
-  if optimizer: optimizer.load_state_dict(checkpoint_dict['optimizer'])
-  saved_state_dict = checkpoint_dict['model']
-  weight_g, weight_v, parent = None, None, None
-  for key, v in saved_state_dict.items():
-    if any(layer in key for layer in skip_list): continue
-    try:
-      obj, skip = model, False
-      for k in key.split('.'):
-        if k.isnumeric(): obj = obj[int(k)]
-        elif isinstance(obj, dict): obj = obj[k]
-        else:
-          if isinstance(obj, (LayerNorm, nn.LayerNorm)) and k in ["gamma", "beta"]:
-            k = "weight" if k == "gamma" else "bias"
-          elif k in ["weight_g", "weight_v"]:
-            parent, skip = obj, True
-            if k == "weight_g": weight_g = v
-            else: weight_v = v
-          if not skip: obj = getattr(obj, k)
-      if weight_g is not None and weight_v is not None:
-        setattr(obj, "weight_g", weight_g.numpy())
-        setattr(obj, "weight_v", weight_v.numpy())
-        obj, v = getattr(parent, "weight"), weight_norm(weight_v, weight_g, 0)
-        weight_g, weight_v, parent, skip = None, None, None, False
-      if not skip and obj.shape == v.shape: obj.assign(v.to(obj.device))
-      elif not skip: logging.error(f"MISMATCH SHAPE IN {key}, {obj.shape} {v.shape}")
-    except Exception as e: raise e
-  logging.info(f"Loaded checkpoint '{checkpoint_path}' (iteration {iteration}) in {time.time() - start_time:.4f}s")
-  return model, optimizer, learning_rate, iteration
-
-# Used for cleaning input text and mapping to symbols
-class TextMapper: # Based on https://github.com/keithito/tacotron
-  def __init__(self, symbols, apply_cleaners=True):
-    self.apply_cleaners, self.symbols, self._inflect = apply_cleaners, symbols, None
-    self._symbol_to_id, _id_to_symbol = {s: i for i, s in enumerate(symbols)}, {i: s for i, s in enumerate(symbols)}
-    self._whitespace_re, self._abbreviations = re.compile(r'\s+'), [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [('mrs', 'misess'), ('mr', 'mister'), ('dr', 'doctor'), ('st', 'saint'), ('co', 'company'), ('jr', 'junior'), ('maj', 'major'), ('gen', 'general'), ('drs', 'doctors'), ('rev', 'reverend'), ('lt', 'lieutenant'), ('hon', 'honorable'), ('sgt', 'sergeant'), ('capt', 'captain'), ('esq', 'esquire'), ('ltd', 'limited'), ('col', 'colonel'), ('ft', 'fort'), ]]
-    self.phonemizer = EspeakBackend(
-        language="en-us", punctuation_marks=Punctuation.default_marks(), preserve_punctuation=True, with_stress=True,
-    )
-  def text_to_sequence(self, text, cleaner_names):
-    if self.apply_cleaners:
-      for name in cleaner_names:
-        cleaner = getattr(self, name)
-        if not cleaner: raise ModuleNotFoundError('Unknown cleaner: %s' % name)
-        text = cleaner(text)
-    else: text = text.strip()
-    return [self._symbol_to_id[symbol] for symbol in text]
-  def get_text(self, text, add_blank=False, cleaners=('english_cleaners2',)):
-    text_norm = self.text_to_sequence(text, cleaners)
-    return Tensor(self.intersperse(text_norm, 0) if add_blank else text_norm, dtype=dtypes.int64)
-  def intersperse(self, lst, item):
-    (result := [item] * (len(lst) * 2 + 1))[1::2] = lst
-    return result
-  def phonemize(self, text, strip=True): return _phonemize(self.phonemizer, text, default_separator, strip, 1, False, False)
-  def filter_oov(self, text): return "".join(list(filter(lambda x: x in self._symbol_to_id, text)))
-  def base_english_cleaners(self, text): return self.collapse_whitespace(self.phonemize(self.expand_abbreviations(unidecode(text.lower()))))
-  def english_cleaners2(self, text): return self.base_english_cleaners(text)
-  def transliteration_cleaners(self, text): return self.collapse_whitespace(unidecode(text.lower()))
-  def cjke_cleaners(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_ipa2(text).replace('ɑ', 'a').replace('ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')))
-  def cjke_cleaners2(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_ipa2(text)))
-  def cjks_cleaners(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_lazy_ipa(text)))
-  def english_to_ipa2(self, text):
-    _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ ('r', 'ɹ'), ('ʤ', 'dʒ'), ('ʧ', 'tʃ')]]
-    return reduce(lambda t, rx: re.sub(rx[0], rx[1], t), _ipa_to_ipa2, self.mark_dark_l(self.english_to_ipa(text))).replace('...', '…')
-  def mark_dark_l(self, text): return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ' + x.group(1), text)
-  def english_to_ipa(self, text):
-    import eng_to_ipa as ipa
-    return self.collapse_whitespace(ipa.convert(self.normalize_numbers(self.expand_abbreviations(unidecode(text).lower()))))
-  def english_to_lazy_ipa(self, text):
-    _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [('r', 'ɹ'), ('æ', 'e'), ('ɑ', 'a'), ('ɔ', 'o'), ('ð', 'z'), ('θ', 's'), ('ɛ', 'e'), ('ɪ', 'i'), ('ʊ', 'u'), ('ʒ', 'ʥ'), ('ʤ', 'ʥ'), ('ˈ', '↓')]]
-    return reduce(lambda t, rx: re.sub(rx[0], rx[1], t), _lazy_ipa, self.english_to_ipa(text))
-  def expand_abbreviations(self, text): return reduce(lambda t, abbr: re.sub(abbr[0], abbr[1], t), self._abbreviations, text)
-  def collapse_whitespace(self, text): return re.sub(self._whitespace_re, ' ', text)
-  def normalize_numbers(self, text):
-    import inflect
-    self._inflect = inflect.engine()
-    text = re.sub(re.compile(r'([0-9][0-9\,]+[0-9])'), self._remove_commas, text)
-    text = re.sub(re.compile(r'£([0-9\,]*[0-9]+)'), r'\1 pounds', text)
-    text = re.sub(re.compile(r'\$([0-9\.\,]*[0-9]+)'), self._expand_dollars, text)
-    text = re.sub(re.compile(r'([0-9]+\.[0-9]+)'), self._expand_decimal_point, text)
-    text = re.sub(re.compile(r'[0-9]+(st|nd|rd|th)'), self._expand_ordinal, text)
-    text = re.sub(re.compile(r'[0-9]+'), self._expand_number, text)
-    return text
-  def _remove_commas(self, m): return m.group(1).replace(',', '') # george won't like this
-  def _expand_dollars(self, m):
-    match = m.group(1)
-    parts = match.split('.')
-    if len(parts) > 2: return match + ' dollars'  # Unexpected format
-    dollars, cents = int(parts[0]) if parts[0] else 0, int(parts[1]) if len(parts) > 1 and parts[1] else 0
-    if dollars and cents: return '%s %s, %s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars', cents, 'cent' if cents == 1 else 'cents')
-    if dollars: return '%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars')
-    if cents: return '%s %s' % (cents, 'cent' if cents == 1 else 'cents')
-    return 'zero dollars'
-  def _expand_decimal_point(self, m): return m.group(1).replace('.', ' point ')
-  def _expand_ordinal(self, m): return self._inflect.number_to_words(m.group(0))
-  def _expand_number(self, _inflect, m):
-    num = int(m.group(0))
-    if 1000 < num < 3000:
-      if num == 2000: return 'two thousand'
-      if 2000 < num < 2010: return 'two thousand ' + self._inflect.number_to_words(num % 100)
-      if num % 100 == 0: return self._inflect.number_to_words(num // 100) + ' hundred'
-      return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
-    return self._inflect.number_to_words(num, andword='')
-
-#########################################################################################
-# PAPER: https://arxiv.org/abs/2106.06103
-# CODE: https://github.com/jaywalnut310/vits/tree/main
-#########################################################################################
-# INSTALLATION: this is based on default config, dependencies are for preprocessing.
-# vctk, ljs                      | pip3 install unidecode phonemizer          | phonemizer requires [eSpeak](https://espeak.sourceforge.net) backend to be installed on your system
-# mmts-tts                       | pip3 install unidecode                     |
-# uma_trilingual, cjks, voistock | pip3 install unidecode inflect eng_to_ipa  |
-#########################################################################################
-# Some good speakers to try out, there may be much better ones, I only tried out a few:
-# male vctk 1  | --model_to_use vctk --speaker_id 2
-# male vctk 2  | --model_to_use vctk --speaker_id 6
-# anime lady 1 | --model_to_use uma_trilingual --speaker_id 36
-# anime lady 2 | --model_to_use uma_trilingual --speaker_id 121
-#########################################################################################
-VITS_PATH = Path(__file__).parents[1] / "weights/VITS/"
-MODELS = { # config_url, weights_url
-  "ljs": ("https://raw.githubusercontent.com/jaywalnut310/vits/main/configs/ljs_base.json", "https://drive.google.com/uc?export=download&id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT&confirm=t"),
-  "vctk": ("https://huggingface.co/csukuangfj/vits-vctk/resolve/main/vctk_base.json", "https://huggingface.co/csukuangfj/vits-vctk/resolve/main/pretrained_vctk.pth"),
-  "mmts-tts": ("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/config.json", "https://huggingface.co/facebook/mms-tts/resolve/main/full_models/eng/G_100000.pth"),
-  "uma_trilingual": ("https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/raw/main/configs/uma_trilingual.json", "https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth"),
-  "cjks": ("https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/config.json", "https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/model.pth"),
-  "voistock": ("https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/15/config.json", "https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/15/model.pth"),
-}
-Y_LENGTH_ESTIMATE_SCALARS = {"ljs": 2.8, "vctk": 1.74, "mmts-tts": 1.9, "uma_trilingual": 2.3, "cjks": 3.3, "voistock": 3.1}
-if __name__ == '__main__':
-  logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
-  parser = argparse.ArgumentParser()
-  parser.add_argument("--model_to_use", default="vctk", help="Specify the model to use. Default is 'vctk'.")
-  parser.add_argument("--speaker_id", type=int, default=6, help="Specify the speaker ID. Default is 6.")
-  parser.add_argument("--out_path", default=None, help="Specify the full output path. Overrides the --out_dir and --name parameter.")
-  parser.add_argument("--out_dir", default=str(Path(__file__).parents[1] / "temp"), help="Specify the output path.")
-  parser.add_argument("--base_name", default="test", help="Specify the base of the output file name. Default is 'test'.")
-  parser.add_argument("--text_to_synthesize", default="""Hello person. If the code you are contributing isn't some of the highest quality code you've written in your life, either put in the effort to make it great, or don't bother.""", help="Specify the text to synthesize. Default is a greeting message.")
-  parser.add_argument("--noise_scale", type=float, default=0.667, help="Specify the noise scale. Default is 0.667.")
-  parser.add_argument("--noise_scale_w", type=float, default=0.8, help="Specify the noise scale w. Default is 0.8.")
-  parser.add_argument("--length_scale", type=float, default=1, help="Specify the length scale. Default is 1.")
-  parser.add_argument("--seed", type=int, default=1337, help="Specify the seed (set to None if no seed). Default is 1337.")
-  parser.add_argument("--num_channels", type=int, default=1, help="Specify the number of audio output channels. Default is 1.")
-  parser.add_argument("--sample_width", type=int, default=2, help="Specify the number of bytes per sample, adjust if necessary. Default is 2.")
-  parser.add_argument("--emotion_path", type=str, default=None, help="Specify the path to emotion reference.")
-  parser.add_argument("--estimate_max_y_length", type=str, default=False, help="If true, overestimate the output length and then trim it to the correct length, to prevent premature realization, much more performant for larger inputs, for smaller inputs not so much. Default is False.")
-  args = parser.parse_args()
-
-  model_config = MODELS[args.model_to_use]
-
-  # Load the hyperparameters from the config file.
-  hps = get_hparams_from_file(fetch(model_config[0]))
-
-  # If model has multiple speakers, validate speaker id and retrieve name if available.
-  model_has_multiple_speakers = hps.data.n_speakers > 0
-  if model_has_multiple_speakers:
-    logging.info(f"Model has {hps.data.n_speakers} speakers")
-    if args.speaker_id >= hps.data.n_speakers: raise ValueError(f"Speaker ID {args.speaker_id} is invalid for this model.")
-    speaker_name = "?"
-    if hps.__contains__("speakers"): # maps speaker ids to names
-      speakers = hps.speakers
-      if isinstance(speakers, List): speakers = {speaker: i for i, speaker in enumerate(speakers)}
-      speaker_name = next((key for key, value in speakers.items() if value == args.speaker_id), None)
-    logging.info(f"You selected speaker {args.speaker_id} (name: {speaker_name})")
-
-  # Load emotions if any. TODO: find an english model with emotions, this is untested atm.
-  emotion_embedding = None
-  if args.emotion_path is not None:
-    if args.emotion_path.endswith(".npy"): emotion_embedding = Tensor(np.load(args.emotion_path), dtype=dtypes.int64).unsqueeze(0)
-    else: raise ValueError("Emotion path must be a .npy file.")
-
-  # Load symbols, instantiate TextMapper and clean the text.
-  if hps.__contains__("symbols"): symbols = hps.symbols
-  elif args.model_to_use == "mmts-tts": symbols = [x.replace("\n", "") for x in fetch("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/vocab.txt").open(encoding="utf-8").readlines()]
-  else: symbols = ['_'] + list(';:,.!?¡¿—…"«»“” ') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') + list("ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ")
-  text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)
-
-  # Load the model.
-  if args.seed is not None:
-    Tensor.manual_seed(args.seed)
-    np.random.seed(args.seed)
-  net_g = load_model(text_mapper.symbols, hps, model_config)
-  logging.debug(f"Loaded model with hps: {hps}")
-
-  # Convert the input text to a tensor.
-  text_to_synthesize = args.text_to_synthesize
-  if args.model_to_use == "mmts-tts": text_to_synthesize = text_mapper.filter_oov(text_to_synthesize.lower())
-  stn_tst = text_mapper.get_text(text_to_synthesize, hps.data.add_blank, hps.data.text_cleaners)
-  logging.debug(f"Converted input text to tensor \"{text_to_synthesize}\" -> Tensor({stn_tst.shape}): {stn_tst.numpy()}")
-  x_tst, x_tst_lengths = stn_tst.unsqueeze(0), Tensor([stn_tst.shape[0]], dtype=dtypes.int64)
-  sid = Tensor([args.speaker_id], dtype=dtypes.int64) if model_has_multiple_speakers else None
-
-  # Perform inference.
-  start_time = time.time()
-  audio_tensor = net_g.infer(x_tst, x_tst_lengths, sid, args.noise_scale, args.length_scale, args.noise_scale_w, emotion_embedding=emotion_embedding,
-                             max_y_length_estimate_scale=Y_LENGTH_ESTIMATE_SCALARS[args.model_to_use] if args.estimate_max_y_length else None)[0, 0].realize()
-  logging.info(f"Inference took {(time.time() - start_time):.2f}s")
-
-  # Save the audio output.
-  audio_data = (np.clip(audio_tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
-  out_path = Path(args.out_path or Path(args.out_dir)/f"{args.model_to_use}{f'_sid_{args.speaker_id}' if model_has_multiple_speakers else ''}_{args.base_name}.wav")
-  out_path.parent.mkdir(parents=True, exist_ok=True)
-  with wave.open(str(out_path), 'wb') as wav_file:
-    wav_file.setnchannels(args.num_channels)
-    wav_file.setsampwidth(args.sample_width)
-    wav_file.setframerate(hps.data.sampling_rate)
-    wav_file.setnframes(len(audio_data))
-    wav_file.writeframes(audio_data.tobytes())
-  logging.info(f"Saved audio output to {out_path}")
--- a/extra/datasets/coco.py
+++ b/extra/datasets/coco.py
@@ -1,199 +0,0 @@
-import json
-import pathlib
-import zipfile
-import numpy as np
-from tinygrad.helpers import fetch
-import pycocotools._mask as _mask
-from examples.mask_rcnn import Masker
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
-
-iou         = _mask.iou
-merge       = _mask.merge
-frPyObjects = _mask.frPyObjects
-
-BASEDIR = pathlib.Path(__file__).parent / "COCO"
-BASEDIR.mkdir(exist_ok=True)
-
-def create_dict(key_row, val_row, rows): return {row[key_row]:row[val_row] for row in rows}
-
-
-if not pathlib.Path(BASEDIR/'val2017').is_dir():
-  fn = fetch('http://images.cocodataset.org/zips/val2017.zip')
-  with zipfile.ZipFile(fn, 'r') as zip_ref:
-    zip_ref.extractall(BASEDIR)
-  fn.unlink()
-
-
-if not pathlib.Path(BASEDIR/'annotations').is_dir():
-  fn = fetch('http://images.cocodataset.org/annotations/annotations_trainval2017.zip')
-  with zipfile.ZipFile(fn, 'r') as zip_ref:
-    zip_ref.extractall(BASEDIR)
-  fn.unlink()
-
-with open(BASEDIR/'annotations/instances_val2017.json', 'r') as f:
-  annotations_raw = json.loads(f.read())
-images = annotations_raw['images']
-categories = annotations_raw['categories']
-annotations = annotations_raw['annotations']
-file_name_to_id = create_dict('file_name', 'id', images)
-id_to_width = create_dict('id', 'width', images)
-id_to_height = create_dict('id', 'height', images)
-json_category_id_to_contiguous_id = {v['id']: i + 1 for i, v in enumerate(categories)}
-contiguous_category_id_to_json_id = {v:k for k,v in json_category_id_to_contiguous_id.items()}
-
-
-def encode(bimask):
-  if len(bimask.shape) == 3:
-    return _mask.encode(bimask)
-  elif len(bimask.shape) == 2:
-    h, w = bimask.shape
-    return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0]
-
-def decode(rleObjs):
-  if type(rleObjs) == list:
-    return _mask.decode(rleObjs)
-  else:
-    return _mask.decode([rleObjs])[:,:,0]
-
-def area(rleObjs):
-  if type(rleObjs) == list:
-    return _mask.area(rleObjs)
-  else:
-    return _mask.area([rleObjs])[0]
-
-def toBbox(rleObjs):
-  if type(rleObjs) == list:
-    return _mask.toBbox(rleObjs)
-  else:
-    return _mask.toBbox([rleObjs])[0]
-
-
-def convert_prediction_to_coco_bbox(file_name, prediction):
-  coco_results = []
-  try:
-    original_id = file_name_to_id[file_name]
-    if len(prediction) == 0:
-      return coco_results
-
-    image_width = id_to_width[original_id]
-    image_height = id_to_height[original_id]
-    prediction = prediction.resize((image_width, image_height))
-    prediction = prediction.convert("xywh")
-
-    boxes = prediction.bbox.numpy().tolist()
-    scores = prediction.get_field("scores").numpy().tolist()
-    labels = prediction.get_field("labels").numpy().tolist()
-
-    mapped_labels = [contiguous_category_id_to_json_id[int(i)] for i in labels]
-
-    coco_results.extend(
-      [
-        {
-          "image_id": original_id,
-          "category_id": mapped_labels[k],
-          "bbox": box,
-          "score": scores[k],
-        }
-          for k, box in enumerate(boxes)
-      ]
-    )
-  except Exception as e:
-    print(file_name, e)
-  return coco_results
-
-masker = Masker(threshold=0.5, padding=1)
-
-def convert_prediction_to_coco_mask(file_name, prediction):
-  coco_results = []
-  try:
-    original_id = file_name_to_id[file_name]
-    if len(prediction) == 0:
-      return coco_results
-
-    image_width = id_to_width[original_id]
-    image_height = id_to_height[original_id]
-    prediction = prediction.resize((image_width, image_height))
-    masks = prediction.get_field("mask")
-
-    scores = prediction.get_field("scores").numpy().tolist()
-    labels = prediction.get_field("labels").numpy().tolist()
-
-    masks = masker([masks], [prediction])[0].numpy()
-
-    rles = [
-      encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
-      for mask in masks
-    ]
-    for rle in rles:
-      rle["counts"] = rle["counts"].decode("utf-8")
-
-    mapped_labels = [contiguous_category_id_to_json_id[int(i)] for i in labels]
-
-    coco_results.extend(
-      [
-        {
-          "image_id": original_id,
-          "category_id": mapped_labels[k],
-          "segmentation": rle,
-          "score": scores[k],
-        }
-          for k, rle in enumerate(rles)
-      ]
-    )
-  except Exception as e:
-    print(file_name, e)
-  return coco_results
-
-
-
-def accumulate_predictions_for_coco(coco_results, json_result_file, rm=False):
-  path = pathlib.Path(json_result_file)
-  if rm and path.exists(): path.unlink()
-  with open(path, "a") as f:
-    for s in coco_results:
-      f.write(json.dumps(s))
-      f.write('\n')
-
-def remove_dup(l):
-  seen = set()
-  seen_add = seen.add
-  return [x for x in l if not (x in seen or seen_add(x))]
-
-class NpEncoder(json.JSONEncoder):
-  def default(self, obj):
-    if isinstance(obj, np.integer):
-      return int(obj)
-    if isinstance(obj, np.floating):
-      return float(obj)
-    if isinstance(obj, np.ndarray):
-      return obj.tolist()
-    return super(NpEncoder, self).default(obj)
-
-
-def evaluate_predictions_on_coco(json_result_file, iou_type="bbox"):
-  coco_results = []
-  with open(json_result_file, "r") as f:
-    for line in f:
-      coco_results.append(json.loads(line))
-
-  coco_gt = COCO(str(BASEDIR/'annotations/instances_val2017.json'))
-  set_of_json = remove_dup([json.dumps(d, cls=NpEncoder) for d in coco_results])
-  unique_list = [json.loads(s) for s in set_of_json]
-
-  with open(f'{json_result_file}.flattend', "w") as f:
-    json.dump(unique_list, f)
-
-  coco_dt = coco_gt.loadRes(str(f'{json_result_file}.flattend'))
-  coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
-  coco_eval.evaluate()
-  coco_eval.accumulate()
-  coco_eval.summarize()
-  return coco_eval
-
-def iterate(files, bs=1):
-  batch = []
-  for file in files:
-    batch.append(file)
-    if len(batch) >= bs: yield batch; batch = []
-  if len(batch) > 0: yield batch; batch = []
--- a/extra/disassemblers/adreno/.gitignore
+++ b/extra/disassemblers/adreno/.gitignore
@@ -1 +0,0 @@
-disasm.so
--- a/extra/disassemblers/adreno/README
+++ b/extra/disassemblers/adreno/README
@@ -1,5 +0,0 @@
-From the Freedreno project
-
-https://gallium.readthedocs.io/en/latest/gallium/drivers/freedreno.html
-
-In Mesa3D, so licensed MIT.
--- a/extra/disassemblers/adreno/disasm-a3xx.c
+++ b/extra/disassemblers/adreno/disasm-a3xx.c
--- a/extra/disassemblers/adreno/instr-a3xx.h
+++ b/extra/disassemblers/adreno/instr-a3xx.h
--- a/extra/disassemblers/adreno/ir3.h
+++ b/extra/disassemblers/adreno/ir3.h
--- a/extra/disassemblers/adreno/shader_enums.h
+++ b/extra/disassemblers/adreno/shader_enums.h
@@ -1,906 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
- * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef SHADER_ENUMS_H
-#define SHADER_ENUMS_H
-
-#include <stdbool.h>
-
-/* Project-wide (GL and Vulkan) maximum. */
-#define MAX_DRAW_BUFFERS 8
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Shader stages.
- *
- * The order must match how shaders are ordered in the pipeline.
- * The GLSL linker assumes that if i<j, then the j-th shader is
- * executed later than the i-th shader.
- */
-typedef enum
-{
-   MESA_SHADER_NONE = -1,
-   MESA_SHADER_VERTEX = 0,
-   MESA_SHADER_TESS_CTRL = 1,
-   MESA_SHADER_TESS_EVAL = 2,
-   MESA_SHADER_GEOMETRY = 3,
-   MESA_SHADER_FRAGMENT = 4,
-   MESA_SHADER_COMPUTE = 5,
-   /* must be last so it doesn't affect the GL pipeline */
-   MESA_SHADER_KERNEL = 6,
-} gl_shader_stage;
-
-static inline bool
-gl_shader_stage_is_compute(gl_shader_stage stage)
-{
-   return stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL;
-}
-
-/**
- * Number of STATE_* values we need to address any GL state.
- * Used to dimension arrays.
- */
-#define STATE_LENGTH 5
-
-typedef short gl_state_index16; /* see enum gl_state_index */
-
-const char *gl_shader_stage_name(gl_shader_stage stage);
-
-/**
- * Translate a gl_shader_stage to a short shader stage name for debug
- * printouts and error messages.
- */
-const char *_mesa_shader_stage_to_string(unsigned stage);
-
-/**
- * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
- * for debug printouts and error messages.
- */
-const char *_mesa_shader_stage_to_abbrev(unsigned stage);
-
-/**
- * GL related stages (not including CL)
- */
-#define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
-
-/**
- * All stages
- */
-#define MESA_ALL_SHADER_STAGES (MESA_SHADER_KERNEL + 1)
-
-
-/**
- * Indexes for vertex program attributes.
- * GL_NV_vertex_program aliases generic attributes over the conventional
- * attributes.  In GL_ARB_vertex_program shader the aliasing is optional.
- * In GL_ARB_vertex_shader / OpenGL 2.0 the aliasing is disallowed (the
- * generic attributes are distinct/separate).
- */
-typedef enum
-{
-   VERT_ATTRIB_POS,
-   VERT_ATTRIB_NORMAL,
-   VERT_ATTRIB_COLOR0,
-   VERT_ATTRIB_COLOR1,
-   VERT_ATTRIB_FOG,
-   VERT_ATTRIB_COLOR_INDEX,
-   VERT_ATTRIB_EDGEFLAG,
-   VERT_ATTRIB_TEX0,
-   VERT_ATTRIB_TEX1,
-   VERT_ATTRIB_TEX2,
-   VERT_ATTRIB_TEX3,
-   VERT_ATTRIB_TEX4,
-   VERT_ATTRIB_TEX5,
-   VERT_ATTRIB_TEX6,
-   VERT_ATTRIB_TEX7,
-   VERT_ATTRIB_POINT_SIZE,
-   VERT_ATTRIB_GENERIC0,
-   VERT_ATTRIB_GENERIC1,
-   VERT_ATTRIB_GENERIC2,
-   VERT_ATTRIB_GENERIC3,
-   VERT_ATTRIB_GENERIC4,
-   VERT_ATTRIB_GENERIC5,
-   VERT_ATTRIB_GENERIC6,
-   VERT_ATTRIB_GENERIC7,
-   VERT_ATTRIB_GENERIC8,
-   VERT_ATTRIB_GENERIC9,
-   VERT_ATTRIB_GENERIC10,
-   VERT_ATTRIB_GENERIC11,
-   VERT_ATTRIB_GENERIC12,
-   VERT_ATTRIB_GENERIC13,
-   VERT_ATTRIB_GENERIC14,
-   VERT_ATTRIB_GENERIC15,
-   VERT_ATTRIB_MAX
-} gl_vert_attrib;
-
-const char *gl_vert_attrib_name(gl_vert_attrib attrib);
-
-/**
- * Symbolic constats to help iterating over
- * specific blocks of vertex attributes.
- *
- * VERT_ATTRIB_FF
- *   includes all fixed function attributes as well as
- *   the aliased GL_NV_vertex_program shader attributes.
- * VERT_ATTRIB_TEX
- *   include the classic texture coordinate attributes.
- *   Is a subset of VERT_ATTRIB_FF.
- * VERT_ATTRIB_GENERIC
- *   include the OpenGL 2.0+ GLSL generic shader attributes.
- *   These alias the generic GL_ARB_vertex_shader attributes.
- * VERT_ATTRIB_MAT
- *   include the generic shader attributes used to alias
- *   varying material values for the TNL shader programs.
- *   They are located at the end of the generic attribute
- *   block not to overlap with the generic 0 attribute.
- */
-#define VERT_ATTRIB_FF(i)           (VERT_ATTRIB_POS + (i))
-#define VERT_ATTRIB_FF_MAX          VERT_ATTRIB_GENERIC0
-
-#define VERT_ATTRIB_TEX(i)          (VERT_ATTRIB_TEX0 + (i))
-#define VERT_ATTRIB_TEX_MAX         MAX_TEXTURE_COORD_UNITS
-
-#define VERT_ATTRIB_GENERIC(i)      (VERT_ATTRIB_GENERIC0 + (i))
-#define VERT_ATTRIB_GENERIC_MAX     MAX_VERTEX_GENERIC_ATTRIBS
-
-#define VERT_ATTRIB_MAT0            \
-   (VERT_ATTRIB_GENERIC_MAX - VERT_ATTRIB_MAT_MAX)
-#define VERT_ATTRIB_MAT(i)          \
-   VERT_ATTRIB_GENERIC((i) + VERT_ATTRIB_MAT0)
-#define VERT_ATTRIB_MAT_MAX         MAT_ATTRIB_MAX
-
-/**
- * Bitflags for vertex attributes.
- * These are used in bitfields in many places.
- */
-/*@{*/
-#define VERT_BIT_POS             BITFIELD_BIT(VERT_ATTRIB_POS)
-#define VERT_BIT_NORMAL          BITFIELD_BIT(VERT_ATTRIB_NORMAL)
-#define VERT_BIT_COLOR0          BITFIELD_BIT(VERT_ATTRIB_COLOR0)
-#define VERT_BIT_COLOR1          BITFIELD_BIT(VERT_ATTRIB_COLOR1)
-#define VERT_BIT_FOG             BITFIELD_BIT(VERT_ATTRIB_FOG)
-#define VERT_BIT_COLOR_INDEX     BITFIELD_BIT(VERT_ATTRIB_COLOR_INDEX)
-#define VERT_BIT_EDGEFLAG        BITFIELD_BIT(VERT_ATTRIB_EDGEFLAG)
-#define VERT_BIT_TEX0            BITFIELD_BIT(VERT_ATTRIB_TEX0)
-#define VERT_BIT_TEX1            BITFIELD_BIT(VERT_ATTRIB_TEX1)
-#define VERT_BIT_TEX2            BITFIELD_BIT(VERT_ATTRIB_TEX2)
-#define VERT_BIT_TEX3            BITFIELD_BIT(VERT_ATTRIB_TEX3)
-#define VERT_BIT_TEX4            BITFIELD_BIT(VERT_ATTRIB_TEX4)
-#define VERT_BIT_TEX5            BITFIELD_BIT(VERT_ATTRIB_TEX5)
-#define VERT_BIT_TEX6            BITFIELD_BIT(VERT_ATTRIB_TEX6)
-#define VERT_BIT_TEX7            BITFIELD_BIT(VERT_ATTRIB_TEX7)
-#define VERT_BIT_POINT_SIZE      BITFIELD_BIT(VERT_ATTRIB_POINT_SIZE)
-#define VERT_BIT_GENERIC0        BITFIELD_BIT(VERT_ATTRIB_GENERIC0)
-
-#define VERT_BIT(i)              BITFIELD_BIT(i)
-#define VERT_BIT_ALL             BITFIELD_RANGE(0, VERT_ATTRIB_MAX)
-
-#define VERT_BIT_FF(i)           VERT_BIT(i)
-#define VERT_BIT_FF_ALL          BITFIELD_RANGE(0, VERT_ATTRIB_FF_MAX)
-#define VERT_BIT_TEX(i)          VERT_BIT(VERT_ATTRIB_TEX(i))
-#define VERT_BIT_TEX_ALL         \
-   BITFIELD_RANGE(VERT_ATTRIB_TEX(0), VERT_ATTRIB_TEX_MAX)
-
-#define VERT_BIT_GENERIC(i)      VERT_BIT(VERT_ATTRIB_GENERIC(i))
-#define VERT_BIT_GENERIC_ALL     \
-   BITFIELD_RANGE(VERT_ATTRIB_GENERIC(0), VERT_ATTRIB_GENERIC_MAX)
-
-#define VERT_BIT_MAT(i)	         VERT_BIT(VERT_ATTRIB_MAT(i))
-#define VERT_BIT_MAT_ALL         \
-   BITFIELD_RANGE(VERT_ATTRIB_MAT(0), VERT_ATTRIB_MAT_MAX)
-/*@}*/
-
-#define MAX_VARYING 32 /**< number of float[4] vectors */
-
-/**
- * Indexes for vertex shader outputs, geometry shader inputs/outputs, and
- * fragment shader inputs.
- *
- * Note that some of these values are not available to all pipeline stages.
- *
- * When this enum is updated, the following code must be updated too:
- * - vertResults (in prog_print.c's arb_output_attrib_string())
- * - fragAttribs (in prog_print.c's arb_input_attrib_string())
- * - _mesa_varying_slot_in_fs()
- */
-typedef enum
-{
-   VARYING_SLOT_POS,
-   VARYING_SLOT_COL0, /* COL0 and COL1 must be contiguous */
-   VARYING_SLOT_COL1,
-   VARYING_SLOT_FOGC,
-   VARYING_SLOT_TEX0, /* TEX0-TEX7 must be contiguous */
-   VARYING_SLOT_TEX1,
-   VARYING_SLOT_TEX2,
-   VARYING_SLOT_TEX3,
-   VARYING_SLOT_TEX4,
-   VARYING_SLOT_TEX5,
-   VARYING_SLOT_TEX6,
-   VARYING_SLOT_TEX7,
-   VARYING_SLOT_PSIZ, /* Does not appear in FS */
-   VARYING_SLOT_BFC0, /* Does not appear in FS */
-   VARYING_SLOT_BFC1, /* Does not appear in FS */
-   VARYING_SLOT_EDGE, /* Does not appear in FS */
-   VARYING_SLOT_CLIP_VERTEX, /* Does not appear in FS */
-   VARYING_SLOT_CLIP_DIST0,
-   VARYING_SLOT_CLIP_DIST1,
-   VARYING_SLOT_CULL_DIST0,
-   VARYING_SLOT_CULL_DIST1,
-   VARYING_SLOT_PRIMITIVE_ID, /* Does not appear in VS */
-   VARYING_SLOT_LAYER, /* Appears as VS or GS output */
-   VARYING_SLOT_VIEWPORT, /* Appears as VS or GS output */
-   VARYING_SLOT_FACE, /* FS only */
-   VARYING_SLOT_PNTC, /* FS only */
-   VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
-   VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
-   VARYING_SLOT_BOUNDING_BOX0, /* Only appears as TCS output. */
-   VARYING_SLOT_BOUNDING_BOX1, /* Only appears as TCS output. */
-   VARYING_SLOT_VIEW_INDEX,
-   VARYING_SLOT_VIEWPORT_MASK, /* Does not appear in FS */
-   VARYING_SLOT_VAR0, /* First generic varying slot */
-   /* the remaining are simply for the benefit of gl_varying_slot_name()
-    * and not to be construed as an upper bound:
-    */
-   VARYING_SLOT_VAR1,
-   VARYING_SLOT_VAR2,
-   VARYING_SLOT_VAR3,
-   VARYING_SLOT_VAR4,
-   VARYING_SLOT_VAR5,
-   VARYING_SLOT_VAR6,
-   VARYING_SLOT_VAR7,
-   VARYING_SLOT_VAR8,
-   VARYING_SLOT_VAR9,
-   VARYING_SLOT_VAR10,
-   VARYING_SLOT_VAR11,
-   VARYING_SLOT_VAR12,
-   VARYING_SLOT_VAR13,
-   VARYING_SLOT_VAR14,
-   VARYING_SLOT_VAR15,
-   VARYING_SLOT_VAR16,
-   VARYING_SLOT_VAR17,
-   VARYING_SLOT_VAR18,
-   VARYING_SLOT_VAR19,
-   VARYING_SLOT_VAR20,
-   VARYING_SLOT_VAR21,
-   VARYING_SLOT_VAR22,
-   VARYING_SLOT_VAR23,
-   VARYING_SLOT_VAR24,
-   VARYING_SLOT_VAR25,
-   VARYING_SLOT_VAR26,
-   VARYING_SLOT_VAR27,
-   VARYING_SLOT_VAR28,
-   VARYING_SLOT_VAR29,
-   VARYING_SLOT_VAR30,
-   VARYING_SLOT_VAR31,
-} gl_varying_slot;
-
-
-#define VARYING_SLOT_MAX	(VARYING_SLOT_VAR0 + MAX_VARYING)
-#define VARYING_SLOT_PATCH0	(VARYING_SLOT_MAX)
-#define VARYING_SLOT_TESS_MAX	(VARYING_SLOT_PATCH0 + MAX_VARYING)
-#define MAX_VARYINGS_INCL_PATCH (VARYING_SLOT_TESS_MAX - VARYING_SLOT_VAR0)
-
-const char *gl_varying_slot_name(gl_varying_slot slot);
-
-/**
- * Bitflags for varying slots.
- */
-/*@{*/
-#define VARYING_BIT_POS BITFIELD64_BIT(VARYING_SLOT_POS)
-#define VARYING_BIT_COL0 BITFIELD64_BIT(VARYING_SLOT_COL0)
-#define VARYING_BIT_COL1 BITFIELD64_BIT(VARYING_SLOT_COL1)
-#define VARYING_BIT_FOGC BITFIELD64_BIT(VARYING_SLOT_FOGC)
-#define VARYING_BIT_TEX0 BITFIELD64_BIT(VARYING_SLOT_TEX0)
-#define VARYING_BIT_TEX1 BITFIELD64_BIT(VARYING_SLOT_TEX1)
-#define VARYING_BIT_TEX2 BITFIELD64_BIT(VARYING_SLOT_TEX2)
-#define VARYING_BIT_TEX3 BITFIELD64_BIT(VARYING_SLOT_TEX3)
-#define VARYING_BIT_TEX4 BITFIELD64_BIT(VARYING_SLOT_TEX4)
-#define VARYING_BIT_TEX5 BITFIELD64_BIT(VARYING_SLOT_TEX5)
-#define VARYING_BIT_TEX6 BITFIELD64_BIT(VARYING_SLOT_TEX6)
-#define VARYING_BIT_TEX7 BITFIELD64_BIT(VARYING_SLOT_TEX7)
-#define VARYING_BIT_TEX(U) BITFIELD64_BIT(VARYING_SLOT_TEX0 + (U))
-#define VARYING_BITS_TEX_ANY BITFIELD64_RANGE(VARYING_SLOT_TEX0, \
-                                              MAX_TEXTURE_COORD_UNITS)
-#define VARYING_BIT_PSIZ BITFIELD64_BIT(VARYING_SLOT_PSIZ)
-#define VARYING_BIT_BFC0 BITFIELD64_BIT(VARYING_SLOT_BFC0)
-#define VARYING_BIT_BFC1 BITFIELD64_BIT(VARYING_SLOT_BFC1)
-#define VARYING_BITS_COLOR (VARYING_BIT_COL0 | \
-                            VARYING_BIT_COL1 |        \
-                            VARYING_BIT_BFC0 |        \
-                            VARYING_BIT_BFC1)
-#define VARYING_BIT_EDGE BITFIELD64_BIT(VARYING_SLOT_EDGE)
-#define VARYING_BIT_CLIP_VERTEX BITFIELD64_BIT(VARYING_SLOT_CLIP_VERTEX)
-#define VARYING_BIT_CLIP_DIST0 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)
-#define VARYING_BIT_CLIP_DIST1 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)
-#define VARYING_BIT_CULL_DIST0 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)
-#define VARYING_BIT_CULL_DIST1 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)
-#define VARYING_BIT_PRIMITIVE_ID BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID)
-#define VARYING_BIT_LAYER BITFIELD64_BIT(VARYING_SLOT_LAYER)
-#define VARYING_BIT_VIEWPORT BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)
-#define VARYING_BIT_FACE BITFIELD64_BIT(VARYING_SLOT_FACE)
-#define VARYING_BIT_PNTC BITFIELD64_BIT(VARYING_SLOT_PNTC)
-#define VARYING_BIT_TESS_LEVEL_OUTER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER)
-#define VARYING_BIT_TESS_LEVEL_INNER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_INNER)
-#define VARYING_BIT_BOUNDING_BOX0 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX0)
-#define VARYING_BIT_BOUNDING_BOX1 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX1)
-#define VARYING_BIT_VIEWPORT_MASK BITFIELD64_BIT(VARYING_SLOT_VIEWPORT_MASK)
-#define VARYING_BIT_VAR(V) BITFIELD64_BIT(VARYING_SLOT_VAR0 + (V))
-/*@}*/
-
-/**
- * Bitflags for system values.
- */
-#define SYSTEM_BIT_SAMPLE_ID ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_ID)
-#define SYSTEM_BIT_SAMPLE_POS ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_POS)
-#define SYSTEM_BIT_SAMPLE_MASK_IN ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_MASK_IN)
-#define SYSTEM_BIT_LOCAL_INVOCATION_ID ((uint64_t)1 << SYSTEM_VALUE_LOCAL_INVOCATION_ID)
-
-/**
- * If the gl_register_file is PROGRAM_SYSTEM_VALUE, the register index will be
- * one of these values.  If a NIR variable's mode is nir_var_system_value, it
- * will be one of these values.
- */
-typedef enum
-{
-   /**
-    * \name System values applicable to all shaders
-    */
-   /*@{*/
-
-   /**
-    * Builtin variables added by GL_ARB_shader_ballot.
-    */
-   /*@{*/
-
-   /**
-    * From the GL_ARB_shader-ballot spec:
-    *
-    *    "A sub-group is a collection of invocations which execute in lockstep.
-    *     The variable <gl_SubGroupSizeARB> is the maximum number of
-    *     invocations in a sub-group. The maximum <gl_SubGroupSizeARB>
-    *     supported in this extension is 64."
-    *
-    * The spec defines this as a uniform. However, it's highly unlikely that
-    * implementations actually treat it as a uniform (which is loaded from a
-    * constant buffer). Most likely, this is an implementation-wide constant,
-    * or perhaps something that depends on the shader stage.
-    */
-   SYSTEM_VALUE_SUBGROUP_SIZE,
-
-   /**
-    * From the GL_ARB_shader_ballot spec:
-    *
-    *    "The variable <gl_SubGroupInvocationARB> holds the index of the
-    *     invocation within sub-group. This variable is in the range 0 to
-    *     <gl_SubGroupSizeARB>-1, where <gl_SubGroupSizeARB> is the total
-    *     number of invocations in a sub-group."
-    */
-   SYSTEM_VALUE_SUBGROUP_INVOCATION,
-
-   /**
-    * From the GL_ARB_shader_ballot spec:
-    *
-    *    "The <gl_SubGroup??MaskARB> variables provide a bitmask for all
-    *     invocations, with one bit per invocation starting with the least
-    *     significant bit, according to the following table,
-    *
-    *       variable               equation for bit values
-    *       --------------------   ------------------------------------
-    *       gl_SubGroupEqMaskARB   bit index == gl_SubGroupInvocationARB
-    *       gl_SubGroupGeMaskARB   bit index >= gl_SubGroupInvocationARB
-    *       gl_SubGroupGtMaskARB   bit index >  gl_SubGroupInvocationARB
-    *       gl_SubGroupLeMaskARB   bit index <= gl_SubGroupInvocationARB
-    *       gl_SubGroupLtMaskARB   bit index <  gl_SubGroupInvocationARB
-    */
-   SYSTEM_VALUE_SUBGROUP_EQ_MASK,
-   SYSTEM_VALUE_SUBGROUP_GE_MASK,
-   SYSTEM_VALUE_SUBGROUP_GT_MASK,
-   SYSTEM_VALUE_SUBGROUP_LE_MASK,
-   SYSTEM_VALUE_SUBGROUP_LT_MASK,
-   /*@}*/
-
-   /**
-    * Builtin variables added by VK_KHR_subgroups
-    */
-   /*@{*/
-   SYSTEM_VALUE_NUM_SUBGROUPS,
-   SYSTEM_VALUE_SUBGROUP_ID,
-   /*@}*/
-
-   /*@}*/
-
-   /**
-    * \name Vertex shader system values
-    */
-   /*@{*/
-   /**
-    * OpenGL-style vertex ID.
-    *
-    * Section 2.11.7 (Shader Execution), subsection Shader Inputs, of the
-    * OpenGL 3.3 core profile spec says:
-    *
-    *     "gl_VertexID holds the integer index i implicitly passed by
-    *     DrawArrays or one of the other drawing commands defined in section
-    *     2.8.3."
-    *
-    * Section 2.8.3 (Drawing Commands) of the same spec says:
-    *
-    *     "The commands....are equivalent to the commands with the same base
-    *     name (without the BaseVertex suffix), except that the ith element
-    *     transferred by the corresponding draw call will be taken from
-    *     element indices[i] + basevertex of each enabled array."
-    *
-    * Additionally, the overview in the GL_ARB_shader_draw_parameters spec
-    * says:
-    *
-    *     "In unextended GL, vertex shaders have inputs named gl_VertexID and
-    *     gl_InstanceID, which contain, respectively the index of the vertex
-    *     and instance. The value of gl_VertexID is the implicitly passed
-    *     index of the vertex being processed, which includes the value of
-    *     baseVertex, for those commands that accept it."
-    *
-    * gl_VertexID gets basevertex added in.  This differs from DirectX where
-    * SV_VertexID does \b not get basevertex added in.
-    *
-    * \note
-    * If all system values are available, \c SYSTEM_VALUE_VERTEX_ID will be
-    * equal to \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus
-    * \c SYSTEM_VALUE_BASE_VERTEX.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_BASE_VERTEX
-    */
-   SYSTEM_VALUE_VERTEX_ID,
-
-   /**
-    * Instanced ID as supplied to gl_InstanceID
-    *
-    * Values assigned to gl_InstanceID always begin with zero, regardless of
-    * the value of baseinstance.
-    *
-    * Section 11.1.3.9 (Shader Inputs) of the OpenGL 4.4 core profile spec
-    * says:
-    *
-    *     "gl_InstanceID holds the integer instance number of the current
-    *     primitive in an instanced draw call (see section 10.5)."
-    *
-    * Through a big chain of pseudocode, section 10.5 describes that
-    * baseinstance is not counted by gl_InstanceID.  In that section, notice
-    *
-    *     "If an enabled vertex attribute array is instanced (it has a
-    *     non-zero divisor as specified by VertexAttribDivisor), the element
-    *     index that is transferred to the GL, for all vertices, is given by
-    *
-    *         floor(instance/divisor) + baseinstance
-    *
-    *     If an array corresponding to an attribute required by a vertex
-    *     shader is not enabled, then the corresponding element is taken from
-    *     the current attribute state (see section 10.2)."
-    *
-    * Note that baseinstance is \b not included in the value of instance.
-    */
-   SYSTEM_VALUE_INSTANCE_ID,
-
-   /**
-    * Vulkan InstanceIndex.
-    *
-    * InstanceIndex = gl_InstanceID + gl_BaseInstance
-    */
-   SYSTEM_VALUE_INSTANCE_INDEX,
-
-   /**
-    * DirectX-style vertex ID.
-    *
-    * Unlike \c SYSTEM_VALUE_VERTEX_ID, this system value does \b not include
-    * the value of basevertex.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_BASE_VERTEX
-    */
-   SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
-
-   /**
-    * Value of \c basevertex passed to \c glDrawElementsBaseVertex and similar
-    * functions.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
-    */
-   SYSTEM_VALUE_BASE_VERTEX,
-
-   /**
-    * Depending on the type of the draw call (indexed or non-indexed),
-    * is the value of \c basevertex passed to \c glDrawElementsBaseVertex and
-    * similar, or is the value of \c first passed to \c glDrawArrays and
-    * similar.
-    *
-    * \note
-    * It can be used to calculate the \c SYSTEM_VALUE_VERTEX_ID as
-    * \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus \c SYSTEM_VALUE_FIRST_VERTEX.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_VERTEX_ID
-    */
-   SYSTEM_VALUE_FIRST_VERTEX,
-
-   /**
-    * If the Draw command used to start the rendering was an indexed draw
-    * or not (~0/0). Useful to calculate \c SYSTEM_VALUE_BASE_VERTEX as
-    * \c SYSTEM_VALUE_IS_INDEXED_DRAW & \c SYSTEM_VALUE_FIRST_VERTEX.
-    */
-   SYSTEM_VALUE_IS_INDEXED_DRAW,
-
-   /**
-    * Value of \c baseinstance passed to instanced draw entry points
-    *
-    * \sa SYSTEM_VALUE_INSTANCE_ID
-    */
-   SYSTEM_VALUE_BASE_INSTANCE,
-
-   /**
-    * From _ARB_shader_draw_parameters:
-    *
-    *   "Additionally, this extension adds a further built-in variable,
-    *    gl_DrawID to the shading language. This variable contains the index
-    *    of the draw currently being processed by a Multi* variant of a
-    *    drawing command (such as MultiDrawElements or
-    *    MultiDrawArraysIndirect)."
-    *
-    * If GL_ARB_multi_draw_indirect is not supported, this is always 0.
-    */
-   SYSTEM_VALUE_DRAW_ID,
-   /*@}*/
-
-   /**
-    * \name Geometry shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_INVOCATION_ID,  /**< (Also in Tessellation Control shader) */
-   /*@}*/
-
-   /**
-    * \name Fragment shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_FRAG_COORD,
-   SYSTEM_VALUE_POINT_COORD,
-   SYSTEM_VALUE_FRONT_FACE,
-   SYSTEM_VALUE_SAMPLE_ID,
-   SYSTEM_VALUE_SAMPLE_POS,
-   SYSTEM_VALUE_SAMPLE_MASK_IN,
-   SYSTEM_VALUE_HELPER_INVOCATION,
-   SYSTEM_VALUE_COLOR0,
-   SYSTEM_VALUE_COLOR1,
-   /*@}*/
-
-   /**
-    * \name Tessellation Evaluation shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_TESS_COORD,
-   SYSTEM_VALUE_VERTICES_IN,    /**< Tessellation vertices in input patch */
-   SYSTEM_VALUE_PRIMITIVE_ID,
-   SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
-   SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
-   SYSTEM_VALUE_TESS_LEVEL_OUTER_DEFAULT, /**< TCS input for passthru TCS */
-   SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT, /**< TCS input for passthru TCS */
-   /*@}*/
-
-   /**
-    * \name Compute shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_LOCAL_INVOCATION_ID,
-   SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
-   SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
-   SYSTEM_VALUE_GLOBAL_INVOCATION_INDEX,
-   SYSTEM_VALUE_WORK_GROUP_ID,
-   SYSTEM_VALUE_NUM_WORK_GROUPS,
-   SYSTEM_VALUE_LOCAL_GROUP_SIZE,
-   SYSTEM_VALUE_GLOBAL_GROUP_SIZE,
-   SYSTEM_VALUE_WORK_DIM,
-   SYSTEM_VALUE_USER_DATA_AMD,
-   /*@}*/
-
-   /** Required for VK_KHR_device_group */
-   SYSTEM_VALUE_DEVICE_INDEX,
-
-   /** Required for VK_KHX_multiview */
-   SYSTEM_VALUE_VIEW_INDEX,
-
-   /**
-    * Driver internal vertex-count, used (for example) for drivers to
-    * calculate stride for stream-out outputs.  Not externally visible.
-    */
-   SYSTEM_VALUE_VERTEX_CNT,
-
-   /**
-    * Required for AMD_shader_explicit_vertex_parameter and also used for
-    * varying-fetch instructions.
-    *
-    * The _SIZE value is "primitive size", used to scale i/j in primitive
-    * space to pixel space.
-    */
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE,
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID,
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE,
-   SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL,
-   SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID,
-   SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE,
-   SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL,
-
-   /**
-    * IR3 specific geometry shader and tesselation control shader system
-    * values that packs invocation id, thread id and vertex id.  Having this
-    * as a nir level system value lets us do the unpacking in nir.
-    */
-   SYSTEM_VALUE_GS_HEADER_IR3,
-   SYSTEM_VALUE_TCS_HEADER_IR3,
-
-   SYSTEM_VALUE_MAX             /**< Number of values */
-} gl_system_value;
-
-const char *gl_system_value_name(gl_system_value sysval);
-
-/**
- * The possible interpolation qualifiers that can be applied to a fragment
- * shader input in GLSL.
- *
- * Note: INTERP_MODE_NONE must be 0 so that memsetting the
- * ir_variable data structure to 0 causes the default behavior.
- */
-enum glsl_interp_mode
-{
-   INTERP_MODE_NONE = 0,
-   INTERP_MODE_SMOOTH,
-   INTERP_MODE_FLAT,
-   INTERP_MODE_NOPERSPECTIVE,
-   INTERP_MODE_EXPLICIT,
-   INTERP_MODE_COUNT /**< Number of interpolation qualifiers */
-};
-
-enum glsl_interface_packing {
-   GLSL_INTERFACE_PACKING_STD140,
-   GLSL_INTERFACE_PACKING_SHARED,
-   GLSL_INTERFACE_PACKING_PACKED,
-   GLSL_INTERFACE_PACKING_STD430
-};
-
-const char *glsl_interp_mode_name(enum glsl_interp_mode qual);
-
-/**
- * Fragment program results
- */
-typedef enum
-{
-   FRAG_RESULT_DEPTH = 0,
-   FRAG_RESULT_STENCIL = 1,
-   /* If a single color should be written to all render targets, this
-    * register is written.  No FRAG_RESULT_DATAn will be written.
-    */
-   FRAG_RESULT_COLOR = 2,
-   FRAG_RESULT_SAMPLE_MASK = 3,
-
-   /* FRAG_RESULT_DATAn are the per-render-target (GLSL gl_FragData[n]
-    * or ARB_fragment_program fragment.color[n]) color results.  If
-    * any are written, FRAG_RESULT_COLOR will not be written.
-    * FRAG_RESULT_DATA1 and up are simply for the benefit of
-    * gl_frag_result_name() and not to be construed as an upper bound
-    */
-   FRAG_RESULT_DATA0 = 4,
-   FRAG_RESULT_DATA1,
-   FRAG_RESULT_DATA2,
-   FRAG_RESULT_DATA3,
-   FRAG_RESULT_DATA4,
-   FRAG_RESULT_DATA5,
-   FRAG_RESULT_DATA6,
-   FRAG_RESULT_DATA7,
-} gl_frag_result;
-
-const char *gl_frag_result_name(gl_frag_result result);
-
-#define FRAG_RESULT_MAX		(FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
-
-/**
- * \brief Layout qualifiers for gl_FragDepth.
- *
- * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with
- * a layout qualifier.
- *
- * \see enum ir_depth_layout
- */
-enum gl_frag_depth_layout
-{
-   FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */
-   FRAG_DEPTH_LAYOUT_ANY,
-   FRAG_DEPTH_LAYOUT_GREATER,
-   FRAG_DEPTH_LAYOUT_LESS,
-   FRAG_DEPTH_LAYOUT_UNCHANGED
-};
-
-/**
- * \brief Buffer access qualifiers
- */
-enum gl_access_qualifier
-{
-   ACCESS_COHERENT      = (1 << 0),
-   ACCESS_RESTRICT      = (1 << 1),
-   ACCESS_VOLATILE      = (1 << 2),
-   ACCESS_NON_READABLE  = (1 << 3),
-   ACCESS_NON_WRITEABLE = (1 << 4),
-
-   /** The access may use a non-uniform buffer or image index */
-   ACCESS_NON_UNIFORM   = (1 << 5),
-
-   /* This has the same semantics as NIR_INTRINSIC_CAN_REORDER, only to be
-    * used with loads. In other words, it means that the load can be
-    * arbitrarily reordered, or combined with other loads to the same address.
-    * It is implied by ACCESS_NON_WRITEABLE together with ACCESS_RESTRICT, and
-    * a lack of ACCESS_COHERENT and ACCESS_VOLATILE.
-    */
-   ACCESS_CAN_REORDER = (1 << 6),
-
-   /** Use as little cache space as possible. */
-   ACCESS_STREAM_CACHE_POLICY = (1 << 7),
-};
-
-/**
- * \brief Blend support qualifiers
- */
-enum gl_advanced_blend_mode
-{
-   BLEND_NONE           = 0x0000,
-
-   BLEND_MULTIPLY       = 0x0001,
-   BLEND_SCREEN         = 0x0002,
-   BLEND_OVERLAY        = 0x0004,
-   BLEND_DARKEN         = 0x0008,
-   BLEND_LIGHTEN        = 0x0010,
-   BLEND_COLORDODGE     = 0x0020,
-   BLEND_COLORBURN      = 0x0040,
-   BLEND_HARDLIGHT      = 0x0080,
-   BLEND_SOFTLIGHT      = 0x0100,
-   BLEND_DIFFERENCE     = 0x0200,
-   BLEND_EXCLUSION      = 0x0400,
-   BLEND_HSL_HUE        = 0x0800,
-   BLEND_HSL_SATURATION = 0x1000,
-   BLEND_HSL_COLOR      = 0x2000,
-   BLEND_HSL_LUMINOSITY = 0x4000,
-
-   BLEND_ALL            = 0x7fff,
-};
-
-enum blend_func
-{
-   BLEND_FUNC_ADD,
-   BLEND_FUNC_SUBTRACT,
-   BLEND_FUNC_REVERSE_SUBTRACT,
-   BLEND_FUNC_MIN,
-   BLEND_FUNC_MAX,
-};
-
-enum blend_factor
-{
-   BLEND_FACTOR_ZERO,
-   BLEND_FACTOR_SRC_COLOR,
-   BLEND_FACTOR_DST_COLOR,
-   BLEND_FACTOR_SRC_ALPHA,
-   BLEND_FACTOR_DST_ALPHA,
-   BLEND_FACTOR_CONSTANT_COLOR,
-   BLEND_FACTOR_CONSTANT_ALPHA,
-   BLEND_FACTOR_SRC_ALPHA_SATURATE,
-};
-
-enum gl_tess_spacing
-{
-   TESS_SPACING_UNSPECIFIED,
-   TESS_SPACING_EQUAL,
-   TESS_SPACING_FRACTIONAL_ODD,
-   TESS_SPACING_FRACTIONAL_EVEN,
-};
-
-/**
- * A compare function enum for use in compiler lowering passes.  This is in
- * the same order as GL's compare functions (shifted down by GL_NEVER), and is
- * exactly the same as gallium's PIPE_FUNC_*.
- */
-enum compare_func
-{
-   COMPARE_FUNC_NEVER,
-   COMPARE_FUNC_LESS,
-   COMPARE_FUNC_EQUAL,
-   COMPARE_FUNC_LEQUAL,
-   COMPARE_FUNC_GREATER,
-   COMPARE_FUNC_NOTEQUAL,
-   COMPARE_FUNC_GEQUAL,
-   COMPARE_FUNC_ALWAYS,
-};
-
-/**
- * Arrangements for grouping invocations from NV_compute_shader_derivatives.
- *
- *   The extension provides new layout qualifiers that support two different
- *   arrangements of compute shader invocations for the purpose of derivative
- *   computation.  When specifying
- *
- *     layout(derivative_group_quadsNV) in;
- *
- *   compute shader invocations are grouped into 2x2x1 arrays whose four local
- *   invocation ID values follow the pattern:
- *
- *       +-----------------+------------------+
- *       | (2x+0, 2y+0, z) |  (2x+1, 2y+0, z) |
- *       +-----------------+------------------+
- *       | (2x+0, 2y+1, z) |  (2x+1, 2y+1, z) |
- *       +-----------------+------------------+
- *
- *   where Y increases from bottom to top.  When specifying
- *
- *     layout(derivative_group_linearNV) in;
- *
- *   compute shader invocations are grouped into 2x2x1 arrays whose four local
- *   invocation index values follow the pattern:
- *
- *       +------+------+
- *       | 4n+0 | 4n+1 |
- *       +------+------+
- *       | 4n+2 | 4n+3 |
- *       +------+------+
- *
- *   If neither layout qualifier is specified, derivatives in compute shaders
- *   return zero, which is consistent with the handling of built-in texture
- *   functions like texture() in GLSL 4.50 compute shaders.
- */
-enum gl_derivative_group {
-   DERIVATIVE_GROUP_NONE = 0,
-   DERIVATIVE_GROUP_QUADS,
-   DERIVATIVE_GROUP_LINEAR,
-};
-
-enum float_controls
-{
-   FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE        = 0x0000,
-   FLOAT_CONTROLS_DENORM_PRESERVE_FP16              = 0x0001,
-   FLOAT_CONTROLS_DENORM_PRESERVE_FP32              = 0x0002,
-   FLOAT_CONTROLS_DENORM_PRESERVE_FP64              = 0x0004,
-   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16         = 0x0008,
-   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32         = 0x0010,
-   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64         = 0x0020,
-   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 = 0x0040,
-   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 = 0x0080,
-   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64 = 0x0100,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16            = 0x0200,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32            = 0x0400,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64            = 0x0800,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16            = 0x1000,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32            = 0x2000,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64            = 0x4000,
-};
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* SHADER_ENUMS_H */
--- a/extra/disassemblers/adreno/util/bitscan.h
+++ b/extra/disassemblers/adreno/util/bitscan.h
@@ -1,326 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifndef BITSCAN_H
-#define BITSCAN_H
-
-#include <assert.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <string.h>
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-
-#if defined(__POPCNT__)
-#include <popcntintrin.h>
-#endif
-
-//#include "c99_compat.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/**
- * Find first bit set in word.  Least significant bit is 1.
- * Return 0 if no bits set.
- */
-#ifdef HAVE___BUILTIN_FFS
-#define ffs __builtin_ffs
-#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
-static inline
-int ffs(int i)
-{
-   unsigned long index;
-   if (_BitScanForward(&index, i))
-      return index + 1;
-   else
-      return 0;
-}
-#else
-extern
-int ffs(int i);
-#endif
-
-#ifdef HAVE___BUILTIN_FFSLL
-#define ffsll __builtin_ffsll
-#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
-static inline int
-ffsll(long long int i)
-{
-   unsigned long index;
-   if (_BitScanForward64(&index, i))
-      return index + 1;
-   else
-      return 0;
-}
-#else
-extern int
-ffsll(long long int val);
-#endif
-
-
-/* Destructively loop over all of the bits in a mask as in:
- *
- * while (mymask) {
- *   int i = u_bit_scan(&mymask);
- *   ... process element i
- * }
- *
- */
-static inline int
-u_bit_scan(unsigned *mask)
-{
-   const int i = ffs(*mask) - 1;
-   *mask ^= (1u << i);
-   return i;
-}
-
-static inline int
-u_bit_scan64(uint64_t *mask)
-{
-   const int i = ffsll(*mask) - 1;
-   *mask ^= (((uint64_t)1) << i);
-   return i;
-}
-
-/* Determine if an unsigned value is a power of two.
- *
- * \note
- * Zero is treated as a power of two.
- */
-static inline bool
-util_is_power_of_two_or_zero(unsigned v)
-{
-   return (v & (v - 1)) == 0;
-}
-
-/* Determine if an uint64_t value is a power of two.
- *
- * \note
- * Zero is treated as a power of two.
- */
-static inline bool
-util_is_power_of_two_or_zero64(uint64_t v)
-{
-   return (v & (v - 1)) == 0;
-}
-
-/* Determine if an unsigned value is a power of two.
- *
- * \note
- * Zero is \b not treated as a power of two.
- */
-static inline bool
-util_is_power_of_two_nonzero(unsigned v)
-{
-   /* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT.  The latter
-    * indicates the existence of the __builtin_popcount function.  The former
-    * indicates that _mm_popcnt_u32 exists and is a native instruction.
-    *
-    * The other alternative is to use SSE 4.2 compile-time flags.  This has
-    * two drawbacks.  First, there is currently no build infrastructure for
-    * SSE 4.2 (only 4.1), so that would have to be added.  Second, some AMD
-    * CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
-    */
-#ifdef __POPCNT__
-   return _mm_popcnt_u32(v) == 1;
-#else
-   return v != 0 && (v & (v - 1)) == 0;
-#endif
-}
-
-/* For looping over a bitmask when you want to loop over consecutive bits
- * manually, for example:
- *
- * while (mask) {
- *    int start, count, i;
- *
- *    u_bit_scan_consecutive_range(&mask, &start, &count);
- *
- *    for (i = 0; i < count; i++)
- *       ... process element (start+i)
- * }
- */
-static inline void
-u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
-{
-   if (*mask == 0xffffffff) {
-      *start = 0;
-      *count = 32;
-      *mask = 0;
-      return;
-   }
-   *start = ffs(*mask) - 1;
-   *count = ffs(~(*mask >> *start)) - 1;
-   *mask &= ~(((1u << *count) - 1) << *start);
-}
-
-static inline void
-u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
-{
-   if (*mask == ~0ull) {
-      *start = 0;
-      *count = 64;
-      *mask = 0;
-      return;
-   }
-   *start = ffsll(*mask) - 1;
-   *count = ffsll(~(*mask >> *start)) - 1;
-   *mask &= ~(((((uint64_t)1) << *count) - 1) << *start);
-}
-
-
-/**
- * Find last bit set in a word.  The least significant bit is 1.
- * Return 0 if no bits are set.
- * Essentially ffs() in the reverse direction.
- */
-static inline unsigned
-util_last_bit(unsigned u)
-{
-#if defined(HAVE___BUILTIN_CLZ)
-   return u == 0 ? 0 : 32 - __builtin_clz(u);
-#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
-   unsigned long index;
-   if (_BitScanReverse(&index, u))
-      return index + 1;
-   else
-      return 0;
-#else
-   unsigned r = 0;
-   while (u) {
-      r++;
-      u >>= 1;
-   }
-   return r;
-#endif
-}
-
-/**
- * Find last bit set in a word.  The least significant bit is 1.
- * Return 0 if no bits are set.
- * Essentially ffsll() in the reverse direction.
- */
-static inline unsigned
-util_last_bit64(uint64_t u)
-{
-#if defined(HAVE___BUILTIN_CLZLL)
-   return u == 0 ? 0 : 64 - __builtin_clzll(u);
-#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
-   unsigned long index;
-   if (_BitScanReverse64(&index, u))
-      return index + 1;
-   else
-      return 0;
-#else
-   unsigned r = 0;
-   while (u) {
-      r++;
-      u >>= 1;
-   }
-   return r;
-#endif
-}
-
-/**
- * Find last bit in a word that does not match the sign bit. The least
- * significant bit is 1.
- * Return 0 if no bits are set.
- */
-static inline unsigned
-util_last_bit_signed(int i)
-{
-   if (i >= 0)
-      return util_last_bit(i);
-   else
-      return util_last_bit(~(unsigned)i);
-}
-
-/* Returns a bitfield in which the first count bits starting at start are
- * set.
- */
-static inline unsigned
-u_bit_consecutive(unsigned start, unsigned count)
-{
-   assert(start + count <= 32);
-   if (count == 32)
-      return ~0;
-   return ((1u << count) - 1) << start;
-}
-
-static inline uint64_t
-u_bit_consecutive64(unsigned start, unsigned count)
-{
-   assert(start + count <= 64);
-   if (count == 64)
-      return ~(uint64_t)0;
-   return (((uint64_t)1 << count) - 1) << start;
-}
-
-/**
- * Return number of bits set in n.
- */
-static inline unsigned
-util_bitcount(unsigned n)
-{
-#if defined(HAVE___BUILTIN_POPCOUNT)
-   return __builtin_popcount(n);
-#else
-   /* K&R classic bitcount.
-    *
-    * For each iteration, clear the LSB from the bitfield.
-    * Requires only one iteration per set bit, instead of
-    * one iteration per bit less than highest set bit.
-    */
-   unsigned bits;
-   for (bits = 0; n; bits++) {
-      n &= n - 1;
-   }
-   return bits;
-#endif
-}
-
-static inline unsigned
-util_bitcount64(uint64_t n)
-{
-#ifdef HAVE___BUILTIN_POPCOUNTLL
-   return __builtin_popcountll(n);
-#else
-   return util_bitcount(n) + util_bitcount(n >> 32);
-#endif
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* BITSCAN_H */
--- a/extra/disassemblers/adreno/util/bitset.h
+++ b/extra/disassemblers/adreno/util/bitset.h
@@ -1,261 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2006  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file bitset.h
- * \brief Bitset of arbitrary size definitions.
- * \author Michal Krol
- */
-
-#ifndef BITSET_H
-#define BITSET_H
-
-//#include "util/bitscan.h"
-//#include "util/macros.h"
-
-/****************************************************************************
- * generic bitset implementation
- */
-
-#define BITSET_WORD unsigned int
-#define BITSET_WORDBITS (sizeof (BITSET_WORD) * 8)
-
-/* bitset declarations
- */
-#define BITSET_WORDS(bits) (((bits) + BITSET_WORDBITS - 1) / BITSET_WORDBITS)
-#define BITSET_DECLARE(name, bits) BITSET_WORD name[BITSET_WORDS(bits)]
-
-/* bitset operations
- */
-#define BITSET_COPY(x, y) memcpy( (x), (y), sizeof (x) )
-#define BITSET_EQUAL(x, y) (memcmp( (x), (y), sizeof (x) ) == 0)
-#define BITSET_ZERO(x) memset( (x), 0, sizeof (x) )
-#define BITSET_ONES(x) memset( (x), 0xff, sizeof (x) )
-
-#define BITSET_BITWORD(b) ((b) / BITSET_WORDBITS)
-#define BITSET_BIT(b) (1u << ((b) % BITSET_WORDBITS))
-
-/* single bit operations
- */
-#define BITSET_TEST(x, b) (((x)[BITSET_BITWORD(b)] & BITSET_BIT(b)) != 0)
-#define BITSET_SET(x, b) ((x)[BITSET_BITWORD(b)] |= BITSET_BIT(b))
-#define BITSET_CLEAR(x, b) ((x)[BITSET_BITWORD(b)] &= ~BITSET_BIT(b))
-
-#define BITSET_MASK(b) (((b) % BITSET_WORDBITS == 0) ? ~0 : BITSET_BIT(b) - 1)
-#define BITSET_RANGE(b, e) ((BITSET_MASK((e) + 1)) & ~(BITSET_BIT(b) - 1))
-
-/* bit range operations
- */
-#define BITSET_TEST_RANGE(x, b, e) \
-   (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   (((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) != 0) : \
-   (assert (!"BITSET_TEST_RANGE: bit range crosses word boundary"), 0))
-#define BITSET_SET_RANGE(x, b, e) \
-   (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   ((x)[BITSET_BITWORD(b)] |= BITSET_RANGE(b, e)) : \
-   (assert (!"BITSET_SET_RANGE: bit range crosses word boundary"), 0))
-#define BITSET_CLEAR_RANGE(x, b, e) \
-   (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   ((x)[BITSET_BITWORD(b)] &= ~BITSET_RANGE(b, e)) : \
-   (assert (!"BITSET_CLEAR_RANGE: bit range crosses word boundary"), 0))
-
-/* Get first bit set in a bitset.
- */
-static inline int
-__bitset_ffs(const BITSET_WORD *x, int n)
-{
-   int i;
-
-   for (i = 0; i < n; i++) {
-      if (x[i])
-	 return ffs(x[i]) + BITSET_WORDBITS * i;
-   }
-
-   return 0;
-}
-
-#define BITSET_FFS(x) __bitset_ffs(x, ARRAY_SIZE(x))
-
-static inline unsigned
-__bitset_next_set(unsigned i, BITSET_WORD *tmp,
-                  const BITSET_WORD *set, unsigned size)
-{
-   unsigned bit, word;
-
-   /* NOTE: The initial conditions for this function are very specific.  At
-    * the start of the loop, the tmp variable must be set to *set and the
-    * initial i value set to 0.  This way, if there is a bit set in the first
-    * word, we ignore the i-value and just grab that bit (so 0 is ok, even
-    * though 0 may be returned).  If the first word is 0, then the value of
-    * `word` will be 0 and we will go on to look at the second word.
-    */
-   word = BITSET_BITWORD(i);
-   while (*tmp == 0) {
-      word++;
-
-      if (word >= BITSET_WORDS(size))
-         return size;
-
-      *tmp = set[word];
-   }
-
-   /* Find the next set bit in the non-zero word */
-   bit = ffs(*tmp) - 1;
-
-   /* Unset the bit */
-   *tmp &= ~(1ull << bit);
-
-   return word * BITSET_WORDBITS + bit;
-}
-
-/**
- * Iterates over each set bit in a set
- *
- * @param __i    iteration variable, bit number
- * @param __set  the bitset to iterate (will not be modified)
- * @param __size number of bits in the set to consider
- */
-#define BITSET_FOREACH_SET(__i, __set, __size) \
-   for (BITSET_WORD __tmp = *(__set), *__foo = &__tmp; __foo != NULL; __foo = NULL) \
-      for (__i = 0; \
-           (__i = __bitset_next_set(__i, &__tmp, __set, __size)) < __size;)
-
-#ifdef __cplusplus
-
-/**
- * Simple C++ wrapper of a bitset type of static size, with value semantics
- * and basic bitwise arithmetic operators.  The operators defined below are
- * expected to have the same semantics as the same operator applied to other
- * fundamental integer types.  T is the name of the struct to instantiate
- * it as, and N is the number of bits in the bitset.
- */
-#define DECLARE_BITSET_T(T, N) struct T {                       \
-      EXPLICIT_CONVERSION                                       \
-      operator bool() const                                     \
-      {                                                         \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            if (words[i])                                       \
-               return true;                                     \
-         return false;                                          \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      operator=(int x)                                          \
-      {                                                         \
-         const T c = {{ (BITSET_WORD)x }};                      \
-         return *this = c;                                      \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator==(const T &b, const T &c)                        \
-      {                                                         \
-         return BITSET_EQUAL(b.words, c.words);                 \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator!=(const T &b, const T &c)                        \
-      {                                                         \
-         return !(b == c);                                      \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator==(const T &b, int x)                             \
-      {                                                         \
-         const T c = {{ (BITSET_WORD)x }};                      \
-         return b == c;                                         \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator!=(const T &b, int x)                             \
-      {                                                         \
-         return !(b == x);                                      \
-      }                                                         \
-                                                                \
-      friend T                                                  \
-      operator~(const T &b)                                     \
-      {                                                         \
-         T c;                                                   \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            c.words[i] = ~b.words[i];                           \
-         return c;                                              \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      operator|=(const T &b)                                    \
-      {                                                         \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            words[i] |= b.words[i];                             \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      friend T                                                  \
-      operator|(const T &b, const T &c)                         \
-      {                                                         \
-         T d = b;                                               \
-         d |= c;                                                \
-         return d;                                              \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      operator&=(const T &b)                                    \
-      {                                                         \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            words[i] &= b.words[i];                             \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      friend T                                                  \
-      operator&(const T &b, const T &c)                         \
-      {                                                         \
-         T d = b;                                               \
-         d &= c;                                                \
-         return d;                                              \
-      }                                                         \
-                                                                \
-      bool                                                      \
-      test(unsigned i) const                                    \
-      {                                                         \
-         return BITSET_TEST(words, i);                          \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      set(unsigned i)                                           \
-      {                                                         \
-         BITSET_SET(words, i);                                  \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      clear(unsigned i)                                         \
-      {                                                         \
-         BITSET_CLEAR(words, i);                                \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      BITSET_WORD words[BITSET_WORDS(N)];                       \
-   }
-
-#endif
-
-#endif
--- a/extra/disassemblers/adreno/util/list.h
+++ b/extra/disassemblers/adreno/util/list.h
@@ -1,262 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2006 VMware, Inc., Bismarck, ND. USA.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- **************************************************************************/
-
-/**
- * \file
- * List macros heavily inspired by the Linux kernel
- * list handling. No list looping yet.
- *
- * Is not threadsafe, so common operations need to
- * be protected using an external mutex.
- */
-
-#ifndef _UTIL_LIST_H_
-#define _UTIL_LIST_H_
-
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <assert.h>
-
-#ifdef DEBUG
-#  define list_assert(cond, msg)  assert(cond && msg)
-#else
-#  define list_assert(cond, msg)  (void)(0 && (cond))
-#endif
-
-struct list_head
-{
-    struct list_head *prev;
-    struct list_head *next;
-};
-
-static inline void list_inithead(struct list_head *item)
-{
-    item->prev = item;
-    item->next = item;
-}
-
-static inline void list_add(struct list_head *item, struct list_head *list)
-{
-    item->prev = list;
-    item->next = list->next;
-    list->next->prev = item;
-    list->next = item;
-}
-
-static inline void list_addtail(struct list_head *item, struct list_head *list)
-{
-    item->next = list;
-    item->prev = list->prev;
-    list->prev->next = item;
-    list->prev = item;
-}
-
-static inline bool list_is_empty(const struct list_head *list);
-
-static inline void list_replace(struct list_head *from, struct list_head *to)
-{
-    if (list_is_empty(from)) {
-        list_inithead(to);
-    } else {
-        to->prev = from->prev;
-        to->next = from->next;
-        from->next->prev = to;
-        from->prev->next = to;
-    }
-}
-
-static inline void list_del(struct list_head *item)
-{
-    item->prev->next = item->next;
-    item->next->prev = item->prev;
-    item->prev = item->next = NULL;
-}
-
-static inline void list_delinit(struct list_head *item)
-{
-    item->prev->next = item->next;
-    item->next->prev = item->prev;
-    item->next = item;
-    item->prev = item;
-}
-
-static inline bool list_is_empty(const struct list_head *list)
-{
-   return list->next == list;
-}
-
-/**
- * Returns whether the list has exactly one element.
- */
-static inline bool list_is_singular(const struct list_head *list)
-{
-   return list->next != NULL && list->next != list && list->next->next == list;
-}
-
-static inline unsigned list_length(const struct list_head *list)
-{
-   struct list_head *node;
-   unsigned length = 0;
-   for (node = list->next; node != list; node = node->next)
-      length++;
-   return length;
-}
-
-static inline void list_splice(struct list_head *src, struct list_head *dst)
-{
-   if (list_is_empty(src))
-      return;
-
-   src->next->prev = dst;
-   src->prev->next = dst->next;
-   dst->next->prev = src->prev;
-   dst->next = src->next;
-}
-
-static inline void list_splicetail(struct list_head *src, struct list_head *dst)
-{
-   if (list_is_empty(src))
-      return;
-
-   src->prev->next = dst;
-   src->next->prev = dst->prev;
-   dst->prev->next = src->next;
-   dst->prev = src->prev;
-}
-
-static inline void list_validate(const struct list_head *list)
-{
-   struct list_head *node;
-   assert(list->next->prev == list && list->prev->next == list);
-   for (node = list->next; node != list; node = node->next)
-      assert(node->next->prev == node && node->prev->next == node);
-}
-
-#define LIST_ENTRY(__type, __item, __field)   \
-    ((__type *)(((char *)(__item)) - offsetof(__type, __field)))
-
-/**
- * Cast from a pointer to a member of a struct back to the containing struct.
- *
- * 'sample' MUST be initialized, or else the result is undefined!
- */
-#ifndef container_of
-#define container_of(ptr, sample, member)				\
-    (void *)((char *)(ptr)						\
-	     - ((char *)&(sample)->member - (char *)(sample)))
-#endif
-
-#define list_first_entry(ptr, type, member) \
-        LIST_ENTRY(type, (ptr)->next, member)
-
-#define list_last_entry(ptr, type, member) \
-        LIST_ENTRY(type, (ptr)->prev, member)
-
-
-#define LIST_FOR_EACH_ENTRY(pos, head, member)				\
-   for (pos = NULL, pos = container_of((head)->next, pos, member);	\
-	&pos->member != (head);						\
-	pos = container_of(pos->member.next, pos, member))
-
-#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member)	\
-   for (pos = NULL, pos = container_of((head)->next, pos, member),	\
-	storage = container_of(pos->member.next, pos, member);	\
-	&pos->member != (head);						\
-	pos = storage, storage = container_of(storage->member.next, storage, member))
-
-#define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member)	\
-   for (pos = NULL, pos = container_of((head)->prev, pos, member),	\
-	storage = container_of(pos->member.prev, pos, member);		\
-	&pos->member != (head);						\
-	pos = storage, storage = container_of(storage->member.prev, storage, member))
-
-#define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member)		\
-   for (pos = NULL, pos = container_of((start), pos, member);		\
-	&pos->member != (head);						\
-	pos = container_of(pos->member.next, pos, member))
-
-#define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member)		\
-   for (pos = NULL, pos = container_of((start), pos, member);		\
-	&pos->member != (head);						\
-	pos = container_of(pos->member.prev, pos, member))
-
-#define list_for_each_entry(type, pos, head, member)                    \
-   for (type *pos = LIST_ENTRY(type, (head)->next, member),             \
-	     *__next = LIST_ENTRY(type, pos->member.next, member);      \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.next, member),               \
-	list_assert(pos == __next, "use _safe iterator"),               \
-	__next = LIST_ENTRY(type, __next->member.next, member))
-
-#define list_for_each_entry_safe(type, pos, head, member)               \
-   for (type *pos = LIST_ENTRY(type, (head)->next, member),             \
-	     *__next = LIST_ENTRY(type, pos->member.next, member);      \
-	&pos->member != (head);                                         \
-	pos = __next,                                                   \
-	__next = LIST_ENTRY(type, __next->member.next, member))
-
-#define list_for_each_entry_rev(type, pos, head, member)                \
-   for (type *pos = LIST_ENTRY(type, (head)->prev, member),             \
-	     *__prev = LIST_ENTRY(type, pos->member.prev, member);      \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.prev, member),               \
-	list_assert(pos == __prev, "use _safe iterator"),               \
-	__prev = LIST_ENTRY(type, __prev->member.prev, member))
-
-#define list_for_each_entry_safe_rev(type, pos, head, member)           \
-   for (type *pos = LIST_ENTRY(type, (head)->prev, member),             \
-	     *__prev = LIST_ENTRY(type, pos->member.prev, member);      \
-	&pos->member != (head);                                         \
-	pos = __prev,                                                   \
-        __prev = LIST_ENTRY(type, __prev->member.prev, member))
-
-#define list_for_each_entry_from(type, pos, start, head, member)        \
-   for (type *pos = LIST_ENTRY(type, (start), member);                  \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.next, member))
-
-#define list_for_each_entry_from_safe(type, pos, start, head, member)   \
-   for (type *pos = LIST_ENTRY(type, (start), member),                  \
-	     *__next = LIST_ENTRY(type, pos->member.next, member);      \
-	&pos->member != (head);                                         \
-	pos = __next,                                                   \
-	__next = LIST_ENTRY(type, __next->member.next, member))
-
-#define list_for_each_entry_from_rev(type, pos, start, head, member)    \
-   for (type *pos = LIST_ENTRY(type, (start), member);                  \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.prev, member))
-
-#define list_pair_for_each_entry(type, pos1, pos2, head1, head2, member) \
-   for (type *pos1 = LIST_ENTRY(type, (head1)->next, member),           \
-             *pos2 = LIST_ENTRY(type, (head2)->next, member);           \
-        &pos1->member != (head1) && &pos2->member != (head2);           \
-	pos1 = LIST_ENTRY(type, pos1->member.next, member),               \
-	pos2 = LIST_ENTRY(type, pos2->member.next, member))
-
-#endif /*_UTIL_LIST_H_*/
--- a/extra/disassemblers/adreno/util/macros.h
+++ b/extra/disassemblers/adreno/util/macros.h
@@ -1,346 +0,0 @@
-/*
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef UTIL_MACROS_H
-#define UTIL_MACROS_H
-
-#include <assert.h>
-
-/* Compute the size of an array */
-#ifndef ARRAY_SIZE
-#  define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
-/* For compatibility with Clang's __has_builtin() */
-#ifndef __has_builtin
-#  define __has_builtin(x) 0
-#endif
-
-/**
- * __builtin_expect macros
- */
-#if !defined(HAVE___BUILTIN_EXPECT)
-#  define __builtin_expect(x, y) (x)
-#endif
-
-#ifndef likely
-#  ifdef HAVE___BUILTIN_EXPECT
-#    define likely(x)   __builtin_expect(!!(x), 1)
-#    define unlikely(x) __builtin_expect(!!(x), 0)
-#  else
-#    define likely(x)   (x)
-#    define unlikely(x) (x)
-#  endif
-#endif
-
-
-/**
- * Static (compile-time) assertion.
- * Basically, use COND to dimension an array.  If COND is false/zero the
- * array size will be -1 and we'll get a compilation error.
- */
-#define STATIC_ASSERT(COND) \
-   do { \
-      (void) sizeof(char [1 - 2*!(COND)]); \
-   } while (0)
-
-
-/**
- * Unreachable macro. Useful for suppressing "control reaches end of non-void
- * function" warnings.
- */
-#if defined(HAVE___BUILTIN_UNREACHABLE) || __has_builtin(__builtin_unreachable)
-#define unreachable(str)    \
-do {                        \
-   assert(!str);            \
-   __builtin_unreachable(); \
-} while (0)
-#elif defined (_MSC_VER)
-#define unreachable(str)    \
-do {                        \
-   assert(!str);            \
-   __assume(0);             \
-} while (0)
-#else
-#define unreachable(str) assert(!str)
-#endif
-
-/**
- * Assume macro. Useful for expressing our assumptions to the compiler,
- * typically for purposes of silencing warnings.
- */
-#if __has_builtin(__builtin_assume)
-#define assume(expr)       \
-do {                       \
-   assert(expr);           \
-   __builtin_assume(expr); \
-} while (0)
-#elif defined HAVE___BUILTIN_UNREACHABLE
-#define assume(expr) ((expr) ? ((void) 0) \
-                             : (assert(!"assumption failed"), \
-                                __builtin_unreachable()))
-#elif defined (_MSC_VER)
-#define assume(expr) __assume(expr)
-#else
-#define assume(expr) assert(expr)
-#endif
-
-/* Attribute const is used for functions that have no effects other than their
- * return value, and only rely on the argument values to compute the return
- * value.  As a result, calls to it can be CSEed.  Note that using memory
- * pointed to by the arguments is not allowed for const functions.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_CONST
-#define ATTRIBUTE_CONST __attribute__((__const__))
-#else
-#define ATTRIBUTE_CONST
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_FLATTEN
-#define FLATTEN __attribute__((__flatten__))
-#else
-#define FLATTEN
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_FORMAT
-#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
-#else
-#define PRINTFLIKE(f, a)
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_MALLOC
-#define MALLOCLIKE __attribute__((__malloc__))
-#else
-#define MALLOCLIKE
-#endif
-
-/* Forced function inlining */
-/* Note: Clang also sets __GNUC__ (see other cases below) */
-#ifndef ALWAYS_INLINE
-#  if defined(__GNUC__)
-#    define ALWAYS_INLINE inline __attribute__((always_inline))
-#  elif defined(_MSC_VER)
-#    define ALWAYS_INLINE __forceinline
-#  else
-#    define ALWAYS_INLINE inline
-#  endif
-#endif
-
-/* Used to optionally mark structures with misaligned elements or size as
- * packed, to trade off performance for space.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_PACKED
-#define PACKED __attribute__((__packed__))
-#else
-#define PACKED
-#endif
-
-/* Attribute pure is used for functions that have no effects other than their
- * return value.  As a result, calls to it can be dead code eliminated.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_PURE
-#define ATTRIBUTE_PURE __attribute__((__pure__))
-#else
-#define ATTRIBUTE_PURE
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL
-#define ATTRIBUTE_RETURNS_NONNULL __attribute__((__returns_nonnull__))
-#else
-#define ATTRIBUTE_RETURNS_NONNULL
-#endif
-
-#ifndef NORETURN
-#  ifdef _MSC_VER
-#    define NORETURN __declspec(noreturn)
-#  elif defined HAVE_FUNC_ATTRIBUTE_NORETURN
-#    define NORETURN __attribute__((__noreturn__))
-#  else
-#    define NORETURN
-#  endif
-#endif
-
-#ifdef __cplusplus
-/**
- * Macro function that evaluates to true if T is a trivially
- * destructible type -- that is, if its (non-virtual) destructor
- * performs no action and all member variables and base classes are
- * trivially destructible themselves.
- */
-#   if (defined(__clang__) && defined(__has_feature))
-#      if __has_feature(has_trivial_destructor)
-#         define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
-#      endif
-#   elif defined(__GNUC__)
-#      if ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)))
-#         define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
-#      endif
-#   elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#      define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
-#   endif
-#   ifndef HAS_TRIVIAL_DESTRUCTOR
-       /* It's always safe (if inefficient) to assume that a
-        * destructor is non-trivial.
-        */
-#      define HAS_TRIVIAL_DESTRUCTOR(T) (false)
-#   endif
-#endif
-
-/**
- * PUBLIC/USED macros
- *
- * If we build the library with gcc's -fvisibility=hidden flag, we'll
- * use the PUBLIC macro to mark functions that are to be exported.
- *
- * We also need to define a USED attribute, so the optimizer doesn't
- * inline a static function that we later use in an alias. - ajax
- */
-#ifndef PUBLIC
-#  if defined(__GNUC__)
-#    define PUBLIC __attribute__((visibility("default")))
-#    define USED __attribute__((used))
-#  elif defined(_MSC_VER)
-#    define PUBLIC __declspec(dllexport)
-#    define USED
-#  else
-#    define PUBLIC
-#    define USED
-#  endif
-#endif
-
-/**
- * UNUSED marks variables (or sometimes functions) that have to be defined,
- * but are sometimes (or always) unused beyond that. A common case is for
- * a function parameter to be used in some build configurations but not others.
- * Another case is fallback vfuncs that don't do anything with their params.
- *
- * Note that this should not be used for identifiers used in `assert()`;
- * see ASSERTED below.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_UNUSED
-#define UNUSED __attribute__((unused))
-#else
-#define UNUSED
-#endif
-
-/**
- * Use ASSERTED to indicate that an identifier is unused outside of an `assert()`,
- * so that assert-free builds don't get "unused variable" warnings.
- */
-#ifdef NDEBUG
-#define ASSERTED UNUSED
-#else
-#define ASSERTED
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT
-#define MUST_CHECK __attribute__((warn_unused_result))
-#else
-#define MUST_CHECK
-#endif
-
-#if defined(__GNUC__)
-#define ATTRIBUTE_NOINLINE __attribute__((noinline))
-#else
-#define ATTRIBUTE_NOINLINE
-#endif
-
-
-/**
- * Check that STRUCT::FIELD can hold MAXVAL.  We use a lot of bitfields
- * in Mesa/gallium.  We have to be sure they're of sufficient size to
- * hold the largest expected value.
- * Note that with MSVC, enums are signed and enum bitfields need one extra
- * high bit (always zero) to ensure the max value is handled correctly.
- * This macro will detect that with MSVC, but not GCC.
- */
-#define ASSERT_BITFIELD_SIZE(STRUCT, FIELD, MAXVAL) \
-   do { \
-      ASSERTED STRUCT s; \
-      s.FIELD = (MAXVAL); \
-      assert((int) s.FIELD == (MAXVAL) && "Insufficient bitfield size!"); \
-   } while (0)
-
-
-/** Compute ceiling of integer quotient of A divided by B. */
-#define DIV_ROUND_UP( A, B )  ( ((A) + (B) - 1) / (B) )
-
-/** Clamp X to [MIN,MAX].  Turn NaN into MIN, arbitrarily. */
-#define CLAMP( X, MIN, MAX )  ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
-
-/** Minimum of two values: */
-#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
-
-/** Maximum of two values: */
-#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
-
-/** Minimum and maximum of three values: */
-#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C))
-#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C))
-
-/** Align a value to a power of two */
-#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
-
-/**
- * Macro for declaring an explicit conversion operator.  Defaults to an
- * implicit conversion if C++11 is not supported.
- */
-#if __cplusplus >= 201103L
-#define EXPLICIT_CONVERSION explicit
-#elif defined(__cplusplus)
-#define EXPLICIT_CONVERSION
-#endif
-
-/** Set a single bit */
-#define BITFIELD_BIT(b)      (1u << (b))
-/** Set all bits up to excluding bit b */
-#define BITFIELD_MASK(b)      \
-   ((b) == 32 ? (~0u) : BITFIELD_BIT((b) % 32) - 1)
-/** Set count bits starting from bit b  */
-#define BITFIELD_RANGE(b, count) \
-   (BITFIELD_MASK((b) + (count)) & ~BITFIELD_MASK(b))
-
-/** Set a single bit */
-#define BITFIELD64_BIT(b)      (1ull << (b))
-/** Set all bits up to excluding bit b */
-#define BITFIELD64_MASK(b)      \
-   ((b) == 64 ? (~0ull) : BITFIELD64_BIT(b) - 1)
-/** Set count bits starting from bit b  */
-#define BITFIELD64_RANGE(b, count) \
-   (BITFIELD64_MASK((b) + (count)) & ~BITFIELD64_MASK(b))
-
-/* TODO: In future we should try to move this to u_debug.h once header
- * dependencies are reorganised to allow this.
- */
-enum pipe_debug_type
-{
-   PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1,
-   PIPE_DEBUG_TYPE_ERROR,
-   PIPE_DEBUG_TYPE_SHADER_INFO,
-   PIPE_DEBUG_TYPE_PERF_INFO,
-   PIPE_DEBUG_TYPE_INFO,
-   PIPE_DEBUG_TYPE_FALLBACK,
-   PIPE_DEBUG_TYPE_CONFORMANCE,
-};
-
-#endif /* UTIL_MACROS_H */
--- a/test/external/external_benchmark_load_stable_diffusion.py
+++ b/test/external/external_benchmark_load_stable_diffusion.py
@@ -1,13 +0,0 @@
-from tinygrad.helpers import fetch, Timing
-from tinygrad.device import Device
-from tinygrad.nn.state import torch_load, load_state_dict
-from examples.stable_diffusion import StableDiffusion
-
-# run "sudo purge" before testing on OS X to avoid the memory cache
-
-if __name__ == "__main__":
-  fn = fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt')
-  model = StableDiffusion()
-  with Timing():
-    load_state_dict(model, torch_load(fn)['state_dict'], strict=False)
-    Device[Device.DEFAULT].synchronize()
--- a/test/external/external_test_embedding.py
+++ b/test/external/external_test_embedding.py
@@ -1,8 +0,0 @@
-from tinygrad.tensor import Tensor
-from tinygrad.nn import Embedding
-
-if __name__ == "__main__":
-  vocab_size = 50257
-  dim = 128
-  test = Embedding(vocab_size, dim)
-  ret = test(Tensor([[1,2,3]])).numpy()
--- a/test/external/external_test_hsa_driver.py
+++ b/test/external/external_test_hsa_driver.py
@@ -1,119 +0,0 @@
-import ctypes, unittest
-from tinygrad.helpers import init_c_struct_t
-from tinygrad.device import Device, Buffer
-from tinygrad.dtype import dtypes
-from tinygrad.runtime.support.hsa import AQLQueue
-from tinygrad.runtime.graph.hsa import VirtAQLQueue, HSAGraph
-from tinygrad.engine.schedule import ExecItem
-from tinygrad.engine.realize import BufferXfer
-from tinygrad.uop.ops import UOp, Ops
-
-def get_hsa_inc_prog(dev, inc=1):
-  prg = f"""
-extern "C" __attribute__((global)) void test_inc(int* data0) {{
-  data0[0] = (data0[0]+{inc});
-}}
-"""
-  return dev.runtime("test_inc", dev.compiler.compile(prg))
-
-def get_hsa_buffer_and_kernargs(dev):
-  test_buf = Buffer(Device.DEFAULT, 1, dtypes.int)
-  test_buf.copyin(memoryview(bytearray(4))) # zero mem
-  assert test_buf.as_buffer().cast('I')[0] == 0 # check mem is visible + sync to exec
-
-  args_struct_t = init_c_struct_t(tuple([('f0', ctypes.c_void_p)]))
-  kernargs = dev.alloc_kernargs(8)
-  args_st = args_struct_t.from_address(kernargs)
-  args_st.__setattr__('f0', test_buf._buf)
-  dev.flush_hdp()
-  return test_buf, kernargs
-
-@unittest.skipUnless(Device.DEFAULT == "HSA", "only run on HSA")
-class TestHSADriver(unittest.TestCase):
-  def test_hsa_simple_enqueue(self):
-    dev = Device[Device.DEFAULT]
-    queue = AQLQueue(dev, sz=256)
-
-    clprg = get_hsa_inc_prog(dev, inc=1)
-    test_buf, kernargs = get_hsa_buffer_and_kernargs(dev)
-
-    queue.submit_kernel(clprg, [1,1,1], [1,1,1], kernargs)
-    queue.wait()
-
-    assert test_buf.as_buffer().cast('I')[0] == 1, f"{test_buf.as_buffer().cast('I')[0]} != 1, all packets executed?"
-    del queue
-
-  def test_hsa_ring_enqueue(self):
-    dev = Device[Device.DEFAULT]
-
-    queue_size = 256
-    exec_cnt = int(queue_size * 1.5)
-    queue = AQLQueue(dev, sz=queue_size)
-
-    clprg_inc1 = get_hsa_inc_prog(dev, inc=1)
-    clprg_inc2 = get_hsa_inc_prog(dev, inc=2)
-    test_buf, kernargs = get_hsa_buffer_and_kernargs(dev)
-
-    for _ in range(exec_cnt):
-      queue.submit_kernel(clprg_inc1, [1,1,1], [1,1,1], kernargs)
-    for _ in range(exec_cnt):
-      queue.submit_kernel(clprg_inc2, [1,1,1], [1,1,1], kernargs)
-    queue.wait()
-
-    expected = exec_cnt + exec_cnt * 2
-    assert test_buf.as_buffer().cast('I')[0] == expected, f"{test_buf.as_buffer().cast('I')[0]} != {expected}, all packets executed?"
-    del queue
-
-  def test_hsa_blit_enqueue(self):
-    dev = Device[Device.DEFAULT]
-
-    queue_size = 256
-    exec_cnt = 178
-    queue = AQLQueue(dev, sz=queue_size)
-
-    test_buf, kernargs = get_hsa_buffer_and_kernargs(dev)
-
-    # Using VirtAQLQueue to blit them
-    virt_queue_packets_cnt = 31
-    virt_queue = VirtAQLQueue(dev, sz=virt_queue_packets_cnt)
-
-    clprogs = []
-    sum_per_blit = 0
-    for i in range(virt_queue_packets_cnt):
-      sum_per_blit += i+1
-      clprogs.append(get_hsa_inc_prog(dev, inc=i+1))
-
-    for i in range(virt_queue_packets_cnt):
-      virt_queue.submit_kernel(clprogs[i], [1,1,1], [1,1,1], kernargs)
-
-    for _ in range(exec_cnt):
-      queue.blit_packets(virt_queue.queue_base, virt_queue.packets_count)
-    queue.wait()
-
-    expected = exec_cnt * sum_per_blit
-    assert test_buf.as_buffer().cast('I')[0] == expected, f"{test_buf.as_buffer().cast('I')[0]} != {expected}, all packets executed?"
-    del queue, clprogs
-
-  def test_hsa_copies_sync(self):
-    d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"
-
-    test_buf0 = Buffer(d0, 1, dtypes.int)
-    test_buf1 = Buffer(d0, 1, dtypes.int)
-    test_buf2 = Buffer(d1, 1, dtypes.int)
-    test_buf0.copyin(memoryview(bytearray(1*4)))
-    test_buf1.copyin(memoryview(bytearray(1*4)))
-    test_buf2.copyin(memoryview(bytearray(1*4)))
-
-    jit_cache = [ExecItem(UOp(Ops.NOOP), [test_buf0, test_buf2], prg=BufferXfer(test_buf0.nbytes, test_buf0.device, test_buf2.device)),
-                 ExecItem(UOp(Ops.NOOP), [test_buf2, test_buf1], prg=BufferXfer(test_buf2.nbytes, test_buf2.device, test_buf1.device))]
-    graph = HSAGraph(jit_cache, [], {})
-
-    for i in range(10000):
-      test_buf0.copyin(memoryview(bytearray(1*4)))
-      test_buf2.copyin(memoryview(bytearray(int.to_bytes(4, length=1*4, byteorder='little'))))
-      graph([], {})
-      assert test_buf0.as_buffer().cast('I')[0] == 4
-      assert test_buf2.as_buffer().cast('I')[0] == 0
-
-if __name__ == '__main__':
-  unittest.main()
--- a/test/external/external_test_yolo.py
+++ b/test/external/external_test_yolo.py
@@ -1,32 +0,0 @@
-import unittest
-from pathlib import Path
-
-import cv2
-
-from examples.yolov3 import Darknet, infer, show_labels
-from tinygrad.helpers import fetch
-
-chicken_img = cv2.imread(str(Path(__file__).parent.parent / 'models/efficientnet/Chicken.jpg'))
-car_img = cv2.imread(str(Path(__file__).parent.parent / 'models/efficientnet/car.jpg'))
-
-class TestYOLO(unittest.TestCase):
-  @classmethod
-  def setUpClass(cls):
-    cls.model = Darknet(fetch("https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg").read_bytes())
-    print("Loading weights file (237MB). This might take a while…")
-    cls.model.load_weights("https://pjreddie.com/media/files/yolov3.weights")
-
-  @classmethod
-  def tearDownClass(cls):
-    del cls.model
-
-  def test_chicken(self):
-    labels = show_labels(infer(self.model, chicken_img), confidence=0.56)
-    self.assertEqual(labels, ["bird"])
-
-  def test_car(self):
-    labels = show_labels(infer(self.model, car_img))
-    self.assertEqual(labels, ["car"])
-
-if __name__ == '__main__':
-  unittest.main()
--- a/test/external/graph_batchnorm.py
+++ b/test/external/graph_batchnorm.py
@@ -1,61 +0,0 @@
-import unittest
-from tinygrad.nn.state import get_parameters
-from tinygrad.tensor import Tensor
-from tinygrad.nn import Conv2d, BatchNorm2d, optim
-
-def model_step(lm):
-  with Tensor.train():
-    x = Tensor.ones(8,12,128,256, requires_grad=False)
-    optimizer = optim.SGD(get_parameters(lm), lr=0.001)
-    loss = lm.forward(x).sum()
-    optimizer.zero_grad()
-    loss.backward()
-    del x,loss
-    optimizer.step()
-
-class TestBatchnorm(unittest.TestCase):
-  def test_conv(self):
-    class LilModel:
-      def __init__(self):
-        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
-      def forward(self, x):
-        return self.c(x).relu()
-    lm = LilModel()
-    model_step(lm)
-
-  def test_two_conv(self):
-    class LilModel:
-      def __init__(self):
-        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
-        self.c2 = Conv2d(32, 32, 3, padding=1, bias=False)
-      def forward(self, x):
-        return self.c2(self.c(x)).relu()
-    lm = LilModel()
-    model_step(lm)
-
-  def test_two_conv_bn(self):
-    class LilModel:
-      def __init__(self):
-        self.c = Conv2d(12, 24, 3, padding=1, bias=False)
-        self.bn = BatchNorm2d(24, track_running_stats=False)
-        self.c2 = Conv2d(24, 32, 3, padding=1, bias=False)
-        self.bn2 = BatchNorm2d(32, track_running_stats=False)
-      def forward(self, x):
-        x = self.bn(self.c(x)).relu()
-        return self.bn2(self.c2(x)).relu()
-    lm = LilModel()
-    model_step(lm)
-
-  def test_conv_bn(self):
-    class LilModel:
-      def __init__(self):
-        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
-        self.bn = BatchNorm2d(32, track_running_stats=False)
-      def forward(self, x):
-        return self.bn(self.c(x)).relu()
-    lm = LilModel()
-    model_step(lm)
-
-
-if __name__ == '__main__':
-  unittest.main()