From 45c459848d329ac9104bf7dfbeed647e3ac72646 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Fri, 19 Dec 2025 17:14:56 -0400
Subject: [PATCH] remove more stale stuff (#13765)

* remove more stale stuff

* remove disassemblers/adreno

* stale
---
 examples/conversation.py                      |  341 ----
 examples/mlperf/model_eval.py                 |   39 +-
 examples/vits.py                              |  740 -------
 extra/datasets/coco.py                        |  199 --
 extra/disassemblers/adreno/.gitignore         |    1 -
 extra/disassemblers/adreno/README             |    5 -
 extra/disassemblers/adreno/disasm-a3xx.c      | 1431 --------------
 extra/disassemblers/adreno/instr-a3xx.h       | 1119 -----------
 extra/disassemblers/adreno/ir3.h              | 1757 -----------------
 extra/disassemblers/adreno/shader_enums.h     |  906 ---------
 extra/disassemblers/adreno/util/bitscan.h     |  326 ---
 extra/disassemblers/adreno/util/bitset.h      |  261 ---
 extra/disassemblers/adreno/util/list.h        |  262 ---
 extra/disassemblers/adreno/util/macros.h      |  346 ----
 ...xternal_benchmark_load_stable_diffusion.py |   13 -
 test/external/external_test_embedding.py      |    8 -
 test/external/external_test_hsa_driver.py     |  119 --
 test/external/external_test_yolo.py           |   32 -
 test/external/graph_batchnorm.py              |   61 -
 19 files changed, 1 insertion(+), 7965 deletions(-)
 delete mode 100644 examples/conversation.py
 delete mode 100644 examples/vits.py
 delete mode 100644 extra/datasets/coco.py
 delete mode 100644 extra/disassemblers/adreno/.gitignore
 delete mode 100644 extra/disassemblers/adreno/README
 delete mode 100644 extra/disassemblers/adreno/disasm-a3xx.c
 delete mode 100644 extra/disassemblers/adreno/instr-a3xx.h
 delete mode 100644 extra/disassemblers/adreno/ir3.h
 delete mode 100644 extra/disassemblers/adreno/shader_enums.h
 delete mode 100644 extra/disassemblers/adreno/util/bitscan.h
 delete mode 100644 extra/disassemblers/adreno/util/bitset.h
 delete mode 100644 extra/disassemblers/adreno/util/list.h
 delete mode 100644 extra/disassemblers/adreno/util/macros.h
 delete mode 100644 test/external/external_benchmark_load_stable_diffusion.py
 delete mode 100644 test/external/external_test_embedding.py
 delete mode 100644 test/external/external_test_hsa_driver.py
 delete mode 100644 test/external/external_test_yolo.py
 delete mode 100644 test/external/graph_batchnorm.py

diff --git a/examples/conversation.py b/examples/conversation.py
deleted file mode 100644
index 8ce9adc5a8..0000000000
--- a/examples/conversation.py
+++ /dev/null
@@ -1,341 +0,0 @@
-import argparse
-import multiprocessing as mp
-import os
-import re
-import sys
-import time
-from contextlib import contextmanager
-from pathlib import Path
-
-import numpy as np
-import pyaudio
-import yaml
-from llama import LLaMa
-from vits import MODELS as VITS_MODELS
-from vits import Y_LENGTH_ESTIMATE_SCALARS, HParams, Synthesizer, TextMapper, get_hparams_from_file, load_model
-from whisper import init_whisper, transcribe_waveform
-from sentencepiece import SentencePieceProcessor
-
-from tinygrad.helpers import Timing, fetch
-from tinygrad import Tensor, dtypes
-
-# Whisper constants
-RATE = 16000
-CHUNK = 1600
-
-# LLaMa constants
-IM_START = 32001
-IM_END = 32002
-
-
-# Functions for encoding prompts to chatml md
-def encode_prompt(spp, k, v): return [IM_START]+spp.encode(f"{k}\n{v}")+[IM_END]+spp.encode("\n")
-def start_prompt(spp, k): return [IM_START]+spp.encode(f"{k}\n")
-
-def chunks(lst, n):
-  for i in range(0, len(lst), n): yield lst[i:i + n]
-
-def create_fixed_tokenizer():
-  """Function needed for extending tokenizer with additional chat tokens"""
-  import extra.junk.sentencepiece_model_pb2 as spb2
-  tokenizer_path = fetch("https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/resolve/main/tokenizer.model")
-  if SentencePieceProcessor(model_file=str(tokenizer_path)).vocab_size() != 32003:
-    print("creating fixed tokenizer")
-    mp = spb2.ModelProto()
-    mp.ParseFromString(tokenizer_path.read_bytes())
-    # https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/blob/main/added_tokens.json
-    mp.pieces.append(spb2.ModelProto.SentencePiece(piece="[PAD]", score=0))
-    mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_start|>", score=0))
-    mp.pieces.append(spb2.ModelProto.SentencePiece(piece="<|im_end|>", score=0))
-    tokenizer_path.write_bytes(mp.SerializeToString())
-  return tokenizer_path
-
-def llama_prepare(llama: LLaMa, temperature: float, pre_prompt_path: Path) -> tuple[list[int], str, str, str]:
-  """Prepares a llama model from a specified pre-prompt file"""
-  with open(str(pre_prompt_path)) as f:
-    config = yaml.safe_load(f.read())
-  toks = [llama.tokenizer.bos_id()] + encode_prompt(llama.tokenizer, "system", config["pre_prompt"].replace("\n", " "))
-  for i in config["examples"]:
-    toks += encode_prompt(llama.tokenizer, config["user_delim"], i["user_prompt"])
-    toks += encode_prompt(llama.tokenizer, config["resp_delim"], i["resp_prompt"])
-  llama.model(Tensor([toks]), 0, temperature).realize()  # NOTE: outputs are not used
-  return toks, config["user_delim"], config["resp_delim"], len(toks), llama.tokenizer.decode(toks)
-
-def llama_generate(
-  llama: LLaMa,
-  toks: list[int],
-  outputted: str,
-  prompt: str,
-  start_pos: int,
-  user_delim: str,
-  resp_delim: str,
-  temperature=0.7,
-  max_tokens=1000
-):
-  """Generates an output for the specified prompt"""
-  toks += encode_prompt(llama.tokenizer, user_delim, prompt)
-  toks += start_prompt(llama.tokenizer, resp_delim)
-
-  outputted = llama.tokenizer.decode(toks)
-  init_length = len(outputted)
-  for _ in range(max_tokens):
-    token = llama.model(Tensor([toks[start_pos:]]), start_pos, temperature).item()
-    start_pos = len(toks)
-    toks.append(token)
-
-    cur = llama.tokenizer.decode(toks)
-
-    # Print is just for debugging
-    sys.stdout.write(cur[len(outputted):])
-    sys.stdout.flush()
-    outputted = cur
-    if toks[-1] == IM_END: break
-  else:
-    toks.append(IM_END)
-  print() # because the output is flushed
-  return outputted, start_pos, outputted[init_length:].replace("<|im_end|>", "")
-
-def tts(
-  text_to_synthesize: str,
-  synth: Synthesizer,
-  hps: HParams,
-  emotion_embedding: Path,
-  speaker_id: int,
-  model_to_use: str,
-  noise_scale: float,
-  noise_scale_w: float,
-  length_scale: float,
-  estimate_max_y_length: bool,
-  text_mapper: TextMapper,
-  model_has_multiple_speakers: bool,
-  pad_length=600,
-  vits_pad_length=1000
-):
-  if model_to_use == "mmts-tts": text_to_synthesize = text_mapper.filter_oov(text_to_synthesize.lower())
-
-  # Convert the input text to a tensor.
-  stn_tst = text_mapper.get_text(text_to_synthesize, hps.data.add_blank, hps.data.text_cleaners)
-  init_shape = stn_tst.shape
-  assert init_shape[0] < pad_length, "text is too long"
-  x_tst, x_tst_lengths = stn_tst.pad(((0, pad_length - init_shape[0]),), value=1).unsqueeze(0), Tensor([init_shape[0]], dtype=dtypes.int64)
-  sid = Tensor([speaker_id], dtype=dtypes.int64) if model_has_multiple_speakers else None
-
-  # Perform inference.
-  audio_tensor = synth.infer(x_tst, x_tst_lengths, sid, noise_scale, length_scale, noise_scale_w, emotion_embedding=emotion_embedding,
-                             max_y_length_estimate_scale=Y_LENGTH_ESTIMATE_SCALARS[model_to_use] if estimate_max_y_length else None, pad_length=vits_pad_length)[0, 0]
-  # Save the audio output.
-  audio_data = (np.clip(audio_tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
-  return audio_data
-
-def init_vits(
-  model_to_use: str,
-  emotion_path: Path,
-  speaker_id: int,
-  seed: int,
-):
-  model_config = VITS_MODELS[model_to_use]
-
-  # Load the hyperparameters from the config file.
-  hps = get_hparams_from_file(fetch(model_config[0]))
-
-  # If model has multiple speakers, validate speaker id and retrieve name if available.
-  model_has_multiple_speakers = hps.data.n_speakers > 0
-  if model_has_multiple_speakers:
-    if speaker_id >= hps.data.n_speakers: raise ValueError(f"Speaker ID {speaker_id} is invalid for this model.")
-    if hps.__contains__("speakers"): # maps speaker ids to names
-      speakers = hps.speakers
-      if isinstance(speakers, list): speakers = {speaker: i for i, speaker in enumerate(speakers)}
-
-  # Load emotions if any. TODO: find an english model with emotions, this is untested atm.
-  emotion_embedding = None
-  if emotion_path is not None:
-    if emotion_path.endswith(".npy"): emotion_embedding = Tensor(np.load(emotion_path), dtype=dtypes.int64).unsqueeze(0)
-    else: raise ValueError("Emotion path must be a .npy file.")
-
-  # Load symbols, instantiate TextMapper and clean the text.
-  if hps.__contains__("symbols"): symbols = hps.symbols
-  elif model_to_use == "mmts-tts": symbols = [x.replace("\n", "") for x in fetch("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/vocab.txt").open(encoding="utf-8").readlines()]
-  else: symbols = ['_'] + list(';:,.!?¡¿—…"«»“” ') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') + list("ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ")
-  text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)
-
-  # Load the model.
-  if seed is not None:
-    Tensor.manual_seed(seed)
-    np.random.seed(seed)
-  net_g = load_model(text_mapper.symbols, hps, model_config)
-
-  return net_g, emotion_embedding, text_mapper, hps, model_has_multiple_speakers
-
-@contextmanager
-def output_stream(num_channels: int, sample_rate: int):
-  try:
-    p = pyaudio.PyAudio()
-    stream = p.open(format=pyaudio.paInt16, channels=num_channels, rate=sample_rate, output=True)
-    yield stream
-  except KeyboardInterrupt: pass
-  finally:
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-@contextmanager
-def log_writer():
-  try:
-    logs = []
-    yield logs
-  finally:
-    sep = "="*os.get_terminal_size()[1]
-    print(f"{sep[:-1]}\nCHAT LOG")
-    print(*logs, sep="\n")
-    print(sep)
-
-def listener(q: mp.Queue, event: mp.Event):
-  try:
-    p = pyaudio.PyAudio()
-    stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE, input=True, frames_per_buffer=CHUNK)
-    did_print = False
-    while True:
-      data = stream.read(CHUNK) # read data to avoid overflow
-      if event.is_set():
-        if not did_print:
-          print("listening")
-          did_print = True
-        q.put(((np.frombuffer(data, np.int16)/32768).astype(np.float32)*3))
-      else:
-        did_print = False
-  finally:
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-def mp_output_stream(q: mp.Queue, counter: mp.Value, num_channels: int, sample_rate: int):
-  with output_stream(num_channels, sample_rate) as stream:
-    while True:
-      try:
-        stream.write(q.get())
-        counter.value += 1
-      except KeyboardInterrupt:
-        break
-
-if __name__ == "__main__":
-  import nltk
-  nltk.download("punkt")
-  # Parse CLI arguments
-  parser = argparse.ArgumentParser("Have a tiny conversation with tinygrad")
-
-  # Whisper args
-  parser.add_argument("--whisper_model_name", type=str, default="tiny.en")
-
-  # LLAMA args
-  parser.add_argument("--llama_pre_prompt_path", type=Path, default=Path(__file__).parent / "conversation_data" / "pre_prompt_stacy.yaml", help="Path to yaml file which contains all pre-prompt data needed. ")
-  parser.add_argument("--llama_count", type=int, default=1000, help="Max number of tokens to generate")
-  parser.add_argument("--llama_temperature", type=float, default=0.7, help="Temperature in the softmax")
-  parser.add_argument("--llama_quantize", type=str, default=None, help="Quantize the weights to int8 or nf4 in memory")
-  parser.add_argument("--llama_model", type=Path, default=None, help="Folder with the original weights to load, or single .index.json, .safetensors or .bin file")
-  parser.add_argument("--llama_gen", type=str, default="tiny", required=False, help="Generation of the model to use")
-  parser.add_argument("--llama_size", type=str, default="1B-Chat", required=False, help="Size of model to use")
-  parser.add_argument("--llama_tokenizer", type=Path, default=None, required=False, help="Path to llama tokenizer.model")
-
-  # vits args
-  parser.add_argument("--vits_model_to_use", default="vctk", help="Specify the model to use. Default is 'vctk'.")
-  parser.add_argument("--vits_speaker_id", type=int, default=12, help="Specify the speaker ID. Default is 6.")
-  parser.add_argument("--vits_noise_scale", type=float, default=0.667, help="Specify the noise scale. Default is 0.667.")
-  parser.add_argument("--vits_noise_scale_w", type=float, default=0.8, help="Specify the noise scale w. Default is 0.8.")
-  parser.add_argument("--vits_length_scale", type=float, default=1, help="Specify the length scale. Default is 1.")
-  parser.add_argument("--vits_seed", type=int, default=None, help="Specify the seed (set to None if no seed). Default is 1337.")
-  parser.add_argument("--vits_num_channels", type=int, default=1, help="Specify the number of audio output channels. Default is 1.")
-  parser.add_argument("--vits_sample_width", type=int, default=2, help="Specify the number of bytes per sample, adjust if necessary. Default is 2.")
-  parser.add_argument("--vits_emotion_path", type=Path, default=None, help="Specify the path to emotion reference.")
-  parser.add_argument("--vits_estimate_max_y_length", type=str, default=False, help="If true, overestimate the output length and then trim it to the correct length, to prevent premature realization, much more performant for larger inputs, for smaller inputs not so much. Default is False.")
-  parser.add_argument("--vits_vocab_path", type=Path, default=None, help="Path to the TTS vocabulary.")
-
-  # conversation args
-  parser.add_argument("--max_sentence_length", type=int, default=20, help="Max words in one sentence to pass to vits")
-
-  args = parser.parse_args()
-
-  # Init models
-  model, enc = init_whisper(args.whisper_model_name)
-  synth, emotion_embedding, text_mapper, hps, model_has_multiple_speakers = init_vits(args.vits_model_to_use, args.vits_emotion_path, args.vits_speaker_id, args.vits_seed)
-
-  # Download tinyllama chat as a default model
-  if args.llama_model is None:
-    args.llama_model = fetch("https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4/resolve/main/model.safetensors", "tinyllamachat.safetensors")
-    args.llama_gen = "tiny"
-    args.llama_size = "1B-Chat"
-  # Add 3 more tokens to the tokenizer
-  if args.llama_gen == "tiny" and args.llama_size.endswith("Chat"): args.llama_tokenizer = create_fixed_tokenizer()
-  tokenizer_path = args.llama_tokenizer or args.llama_model.parent / "tokenizer.model"
-  llama = LLaMa.build(args.llama_model, tokenizer_path, args.llama_gen, args.llama_size, args.llama_quantize)
-  toks, user_delim, resp_delim, start_pos, outputted = llama_prepare(llama, args.llama_temperature, args.llama_pre_prompt_path)
-
-  # Start child process for mic input
-  q = mp.Queue()
-  is_listening_event = mp.Event()
-  p = mp.Process(target=listener, args=(q, is_listening_event,))
-  p.daemon = True
-  p.start()
-
-  # Start child process for speaker output
-  out_q = mp.Queue()
-  out_counter = mp.Value("i", 0)
-  out_p = mp.Process(target=mp_output_stream, args=(out_q, out_counter, args.vits_num_channels, hps.data.sampling_rate,))
-  out_p.daemon = True
-  out_p.start()
-
-  # JIT tts
-  for i in ["Hello, I'm a chat bot", "I am capable of doing a lot of things"]:
-    tts(
-      i, synth, hps, emotion_embedding,
-      args.vits_speaker_id, args.vits_model_to_use, args.vits_noise_scale,
-      args.vits_noise_scale_w, args.vits_length_scale,
-      args.vits_estimate_max_y_length, text_mapper, model_has_multiple_speakers
-    )
-
-  # Start the pipeline
-  with log_writer() as log:
-    while True:
-      tokens = [enc._special_tokens["<|startoftranscript|>"], enc._special_tokens["<|notimestamps|>"]]
-      total = np.array([])
-      out_counter.value = 0
-
-      s = time.perf_counter()
-      is_listening_event.set()
-      prev_text = None
-      while True:
-        for _ in range(RATE // CHUNK): total = np.concatenate([total, q.get()])
-        txt = transcribe_waveform(model, enc, [total], truncate=True)
-        print(txt, end="\r")
-        if txt == "[BLANK_AUDIO]" or re.match(r"^\([\w+ ]+\)$", txt.strip()): continue
-        if prev_text is not None and prev_text == txt:
-          is_listening_event.clear()
-          break
-        prev_text = txt
-      print() # to avoid llama printing on the same line
-      log.append(f"{user_delim.capitalize()}: {txt}")
-
-      # Generate with llama
-      with Timing("llama generation: "):
-        outputted, start_pos, response = llama_generate(
-          llama, toks, outputted, txt, start_pos,
-          user_delim=user_delim, resp_delim=resp_delim, temperature=args.llama_temperature,
-          max_tokens=args.llama_count
-        )
-        log.append(f"{resp_delim.capitalize()}: {response}")
-
-      # Convert to voice
-      with Timing("tts: "):
-        sentences = nltk.sent_tokenize(response.replace('"', ""))
-        for i in sentences:
-          total = np.array([], dtype=np.int16)
-          for j in chunks(i.split(), args.max_sentence_length):
-            audio_data = tts(
-              " ".join(j), synth, hps, emotion_embedding,
-              args.vits_speaker_id, args.vits_model_to_use, args.vits_noise_scale,
-              args.vits_noise_scale_w, args.vits_length_scale,
-              args.vits_estimate_max_y_length, text_mapper, model_has_multiple_speakers
-            )
-            total = np.concatenate([total, audio_data])
-          out_q.put(total.tobytes())
-      while out_counter.value < len(sentences): continue
-      log.append(f"Total: {time.perf_counter() - s}")
diff --git a/examples/mlperf/model_eval.py b/examples/mlperf/model_eval.py
index 689e4d81a2..4a1c1f4e7c 100644
--- a/examples/mlperf/model_eval.py
+++ b/examples/mlperf/model_eval.py
@@ -204,43 +204,6 @@ def eval_bert():
 
     st = time.perf_counter()
 
-def eval_mrcnn():
-  from tqdm import tqdm
-  from extra.models.mask_rcnn import MaskRCNN
-  from extra.models.resnet import ResNet
-  from extra.datasets.coco import BASEDIR, images, convert_prediction_to_coco_bbox, convert_prediction_to_coco_mask, accumulate_predictions_for_coco, evaluate_predictions_on_coco, iterate
-  from examples.mask_rcnn import compute_prediction_batched, Image
-  mdl = MaskRCNN(ResNet(50, num_classes=None, stride_in_1x1=True))
-  mdl.load_from_pretrained()
-
-  bbox_output = '/tmp/results_bbox.json'
-  mask_output = '/tmp/results_mask.json'
-
-  accumulate_predictions_for_coco([], bbox_output, rm=True)
-  accumulate_predictions_for_coco([], mask_output, rm=True)
-
-  #TODO: bs > 1 not as accurate
-  bs = 1
-
-  for batch in tqdm(iterate(images, bs=bs), total=len(images)//bs):
-    batch_imgs = []
-    for image_row in batch:
-      image_name = image_row['file_name']
-      img = Image.open(BASEDIR/f'val2017/{image_name}').convert("RGB")
-      batch_imgs.append(img)
-    batch_result = compute_prediction_batched(batch_imgs, mdl)
-    for image_row, result in zip(batch, batch_result):
-      image_name = image_row['file_name']
-      box_pred = convert_prediction_to_coco_bbox(image_name, result)
-      mask_pred = convert_prediction_to_coco_mask(image_name, result)
-      accumulate_predictions_for_coco(box_pred, bbox_output)
-      accumulate_predictions_for_coco(mask_pred, mask_output)
-    del batch_imgs
-    del batch_result
-
-  evaluate_predictions_on_coco(bbox_output, iou_type='bbox')
-  evaluate_predictions_on_coco(mask_output, iou_type='segm')
-
 def eval_llama3():
   from extra.models.llama import Transformer
   from examples.llama3 import MODEL_PARAMS, load, convert_from_huggingface
@@ -541,7 +504,7 @@ if __name__ == "__main__":
   # inference only
   Tensor.training = False
 
-  models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert,mrcnn").split(",")
+  models = getenv("MODEL", "resnet,retinanet,unet3d,rnnt,bert").split(",")
   for m in models:
     nm = f"eval_{m}"
     if nm in globals():
diff --git a/examples/vits.py b/examples/vits.py
deleted file mode 100644
index b315a5253a..0000000000
--- a/examples/vits.py
+++ /dev/null
@@ -1,740 +0,0 @@
-import json, logging, math, re, sys, time, wave, argparse, numpy as np
-from phonemizer.phonemize import default_separator, _phonemize
-from phonemizer.backend import EspeakBackend
-from phonemizer.punctuation import Punctuation
-from functools import reduce
-from pathlib import Path
-from typing import List
-from tinygrad import nn, dtypes
-from tinygrad.helpers import fetch
-from tinygrad.nn.state import torch_load
-from tinygrad.tensor import Tensor
-from tinygrad.engine.jit import TinyJit
-from unidecode import unidecode
-
-LRELU_SLOPE = 0.1
-
-class Synthesizer:
-  def __init__(self, n_vocab, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, n_speakers=0, gin_channels=0, use_sdp=True, emotion_embedding=False, **kwargs):
-    self.n_vocab, self.spec_channels, self.inter_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout, self.resblock, self.resblock_kernel_sizes, self.resblock_dilation_sizes, self.upsample_rates, self.upsample_initial_channel, self.upsample_kernel_sizes, self.segment_size, self.n_speakers, self.gin_channels, self.use_sdp = n_vocab, spec_channels, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, segment_size, n_speakers, gin_channels, use_sdp
-    self.enc_p = TextEncoder(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, emotion_embedding)
-    self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
-    self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
-    self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) if use_sdp else DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
-    if n_speakers > 1: self.emb_g = nn.Embedding(n_speakers, gin_channels)
-  def infer(self, x, x_lengths, sid=None, noise_scale=1.0, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None, max_y_length_estimate_scale=None, pad_length=-1):
-    x, m_p, logs_p, x_mask = self.enc_p.forward(x.realize(), x_lengths.realize(), emotion_embedding.realize() if emotion_embedding is not None else emotion_embedding)
-    g = self.emb_g(sid.reshape(1, 1)).squeeze(1).unsqueeze(-1) if self.n_speakers > 0 else None
-    logw = self.dp.forward(x, x_mask.realize(), g=g.realize(), reverse=self.use_sdp, noise_scale=noise_scale_w if self.use_sdp else 1.0)
-    w_ceil = Tensor.ceil(logw.exp() * x_mask * length_scale)
-    y_lengths = Tensor.maximum(w_ceil.sum([1, 2]), 1).cast(dtypes.int64)
-    return self.generate(g, logs_p, m_p, max_len, max_y_length_estimate_scale, noise_scale, w_ceil, x, x_mask, y_lengths, pad_length)
-  def generate(self, g, logs_p, m_p, max_len, max_y_length_estimate_scale, noise_scale, w_ceil, x, x_mask, y_lengths, pad_length):
-    max_y_length = y_lengths.max().item() if max_y_length_estimate_scale is None else max(15, x.shape[-1]) * max_y_length_estimate_scale
-    y_mask = sequence_mask(y_lengths, max_y_length).unsqueeze(1).cast(x_mask.dtype)
-    attn_mask = x_mask.unsqueeze(2) * y_mask.unsqueeze(-1)
-    attn = generate_path(w_ceil, attn_mask)
-    m_p_2 = attn.squeeze(1).matmul(m_p.transpose(1, 2)).transpose(1, 2)        # [b, t', t], [b, t, d] -> [b, d, t']
-    logs_p_2 = attn.squeeze(1).matmul(logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
-    z_p = m_p_2 + Tensor.randn(*m_p_2.shape, dtype=m_p_2.dtype) * logs_p_2.exp() * noise_scale
-    row_len = y_mask.shape[2]
-    if pad_length > -1:
-      # Pad flow forward inputs to enable JIT
-      assert pad_length > row_len, "pad length is too small"
-      y_mask = y_mask.pad(((0, 0), (0, 0), (0, pad_length - row_len))).cast(z_p.dtype)
-      # New y_mask tensor to remove sts mask
-      y_mask = Tensor(y_mask.numpy(), device=y_mask.device, dtype=y_mask.dtype, requires_grad=y_mask.requires_grad)
-      z_p = z_p.squeeze(0).pad(((0, 0), (0, pad_length - z_p.shape[2])), value=1).unsqueeze(0)
-    z = self.flow.forward(z_p.realize(), y_mask.realize(), g=g.realize(), reverse=True)
-    result_length = reduce(lambda x, y: x * y, self.dec.upsample_rates, row_len)
-    o = self.dec.forward((z * y_mask)[:, :, :max_len], g=g)[:, :, :result_length]
-    if max_y_length_estimate_scale is not None:
-      length_scaler = o.shape[-1] / max_y_length
-      o.realize()
-      real_max_y_length = y_lengths.max().numpy()
-      if real_max_y_length > max_y_length:
-        logging.warning(f"Underestimated max length by {(((real_max_y_length / max_y_length) * 100) - 100):.2f}%, recomputing inference without estimate...")
-        return self.generate(g, logs_p, m_p, max_len, None, noise_scale, w_ceil, x, x_mask, y_lengths)
-      if real_max_y_length < max_y_length:
-        overestimation = ((max_y_length / real_max_y_length) * 100) - 100
-        logging.info(f"Overestimated max length by {overestimation:.2f}%")
-        if overestimation > 10: logging.warning("Warning: max length overestimated by more than 10%")
-      o = o[:, :, :(real_max_y_length * length_scaler).astype(np.int32)]
-    return o
-
-class StochasticDurationPredictor:
-  def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
-    filter_channels = in_channels # it needs to be removed from future version.
-    self.in_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.n_flows, self.gin_channels = in_channels, filter_channels, kernel_size, p_dropout, n_flows, gin_channels
-    self.log_flow, self.flows = Log(), [ElementwiseAffine(2)]
-    for _ in range(n_flows):
-      self.flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
-      self.flows.append(Flip())
-    self.post_pre, self.post_proj = nn.Conv1d(1, filter_channels, 1), nn.Conv1d(filter_channels, filter_channels, 1)
-    self.post_convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
-    self.post_flows = [ElementwiseAffine(2)]
-    for _ in range(4):
-      self.post_flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
-      self.post_flows.append(Flip())
-    self.pre, self.proj = nn.Conv1d(in_channels, filter_channels, 1), nn.Conv1d(filter_channels, filter_channels, 1)
-    self.convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
-  @TinyJit
-  def forward(self, x: Tensor, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
-    x = self.pre(x.detach())
-    if g is not None: x = x + self.cond(g.detach())
-    x = self.convs.forward(x, x_mask)
-    x = self.proj(x) * x_mask
-    if not reverse:
-      flows = self.flows
-      assert w is not None
-      log_det_tot_q = 0
-      h_w = self.post_proj(self.post_convs.forward(self.post_pre(w), x_mask)) * x_mask
-      e_q = Tensor.randn(w.size(0), 2, w.size(2), dtype=x.dtype).to(device=x.device) * x_mask
-      z_q = e_q
-      for flow in self.post_flows:
-        z_q, log_det_q = flow.forward(z_q, x_mask, g=(x + h_w))
-        log_det_tot_q += log_det_q
-      z_u, z1 = z_q.split([1, 1], 1)
-      u = z_u.sigmoid() * x_mask
-      z0 = (w - u) * x_mask
-      log_det_tot_q += Tensor.sum((z_u.logsigmoid() + (-z_u).logsigmoid()) * x_mask, [1,2])
-      log_q = Tensor.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - log_det_tot_q
-      log_det_tot = 0
-      z0, log_det = self.log_flow.forward(z0, x_mask)
-      log_det_tot += log_det
-      z = z0.cat(z1, 1)
-      for flow in flows:
-        z, log_det = flow.forward(z, x_mask, g=x, reverse=reverse)
-        log_det_tot = log_det_tot + log_det
-      nll = Tensor.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - log_det_tot
-      return (nll + log_q).realize() # [b]
-    flows = list(reversed(self.flows))
-    flows = flows[:-2] + [flows[-1]] # remove a useless vflow
-    z = Tensor.randn(x.shape[0], 2, x.shape[2], dtype=x.dtype).to(device=x.device) * noise_scale
-    for flow in flows: z = flow.forward(z, x_mask, g=x, reverse=reverse)
-    z0, z1 = z.split([1, 1], 1)
-    return z0.realize()
-
-class DurationPredictor:
-  def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
-    self.in_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.gin_channels = in_channels, filter_channels, kernel_size, p_dropout, gin_channels
-    self.conv_1, self.norm_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2), LayerNorm(filter_channels)
-    self.conv_2, self.norm_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2), LayerNorm(filter_channels)
-    self.proj = nn.Conv1d(filter_channels, 1, 1)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, in_channels, 1)
-  def forward(self, x: Tensor, x_mask, g=None):
-    x = x.detach()
-    if g is not None: x = x + self.cond(g.detach())
-    x = self.conv_1(x * x_mask).relu()
-    x = self.norm_1(x).dropout(self.p_dropout)
-    x = self.conv_2(x * x_mask).relu(x)
-    x = self.norm_2(x).dropout(self.p_dropout)
-    return self.proj(x * x_mask) * x_mask
-
-class TextEncoder:
-  def __init__(self, n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, emotion_embedding):
-    self.n_vocab, self.out_channels, self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout = n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-    if n_vocab!=0:self.emb = nn.Embedding(n_vocab, hidden_channels)
-    if emotion_embedding: self.emo_proj = nn.Linear(1024, hidden_channels)
-    self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
-    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-  @TinyJit
-  def forward(self, x: Tensor, x_lengths: Tensor, emotion_embedding=None):
-    if self.n_vocab!=0: x = (self.emb(x) * math.sqrt(self.hidden_channels))
-    if emotion_embedding: x = x + self.emo_proj(emotion_embedding).unsqueeze(1)
-    x = x.transpose(1, -1)  # [b, t, h] -transpose-> [b, h, t]
-    x_mask = sequence_mask(x_lengths, x.shape[2]).unsqueeze(1).cast(x.dtype)
-    x = self.encoder.forward(x * x_mask, x_mask)
-    m, logs = (self.proj(x) * x_mask).split(self.out_channels, dim=1)
-    return x.realize(), m.realize(), logs.realize(), x_mask.realize()
-
-class ResidualCouplingBlock:
-  def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
-    self.channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.n_flows, self.gin_channels = channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows, gin_channels
-    self.flows = []
-    for _ in range(n_flows):
-      self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
-      self.flows.append(Flip())
-  @TinyJit
-  def forward(self, x, x_mask, g=None, reverse=False):
-    for flow in reversed(self.flows) if reverse else self.flows: x = flow.forward(x, x_mask, g=g, reverse=reverse)
-    return x.realize()
-
-class PosteriorEncoder:
-  def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
-    self.in_channels, self.out_channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.gin_channels = in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels
-    self.pre, self.proj = nn.Conv1d(in_channels, hidden_channels, 1), nn.Conv1d(hidden_channels, out_channels * 2, 1)
-    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-  def forward(self, x, x_lengths, g=None):
-    x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).cast(x.dtype)
-    stats = self.proj(self.enc.forward(self.pre(x) * x_mask, x_mask, g=g)) * x_mask
-    m, logs = stats.split(self.out_channels, dim=1)
-    z = (m + Tensor.randn(m.shape, m.dtype) * logs.exp()) * x_mask
-    return z, m, logs, x_mask
-
-class Generator:
-  def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
-    self.num_kernels, self.num_upsamples = len(resblock_kernel_sizes), len(upsample_rates)
-    self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
-    resblock = ResBlock1 if resblock == '1' else ResBlock2
-    self.ups = [nn.ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), k, u, padding=(k-u)//2) for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes))]
-    self.resblocks = []
-    self.upsample_rates = upsample_rates
-    for i in range(len(self.ups)):
-      ch = upsample_initial_channel // (2 ** (i + 1))
-      for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-        self.resblocks.append(resblock(ch, k, d))
-    self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-    if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-  @TinyJit
-  def forward(self, x: Tensor, g=None):
-    x = self.conv_pre(x)
-    if g is not None:  x = x + self.cond(g)
-    for i in range(self.num_upsamples):
-      x = self.ups[i](x.leaky_relu(LRELU_SLOPE))
-      xs = sum(self.resblocks[i * self.num_kernels + j].forward(x) for j in range(self.num_kernels))
-      x = (xs / self.num_kernels).realize()
-    res = self.conv_post(x.leaky_relu()).tanh().realize()
-    return res
-
-class LayerNorm(nn.LayerNorm):
-  def __init__(self, channels, eps=1e-5): super().__init__(channels, eps, elementwise_affine=True)
-  def forward(self, x: Tensor): return self.__call__(x.transpose(1, -1)).transpose(1, -1)
-
-class WN:
-  def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
-    assert (kernel_size % 2 == 1)
-    self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.gin_channels, self.p_dropout = hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels, p_dropout
-    self.in_layers, self.res_skip_layers = [], []
-    if gin_channels != 0: self.cond_layer = nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
-    for i in range(n_layers):
-      dilation = dilation_rate ** i
-      self.in_layers.append(nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=int((kernel_size * dilation - dilation) / 2)))
-      self.res_skip_layers.append(nn.Conv1d(hidden_channels, 2 * hidden_channels if i < n_layers - 1 else hidden_channels, 1))
-  def forward(self, x, x_mask, g=None, **kwargs):
-    output = Tensor.zeros_like(x)
-    if g is not None: g = self.cond_layer(g)
-    for i in range(self.n_layers):
-      x_in = self.in_layers[i](x)
-      if g is not None:
-        cond_offset = i * 2 * self.hidden_channels
-        g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
-      else:
-        g_l = Tensor.zeros_like(x_in)
-      acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.hidden_channels)
-      res_skip_acts = self.res_skip_layers[i](acts)
-      if i < self.n_layers - 1:
-        x = (x + res_skip_acts[:, :self.hidden_channels, :]) * x_mask
-        output = output + res_skip_acts[:, self.hidden_channels:, :]
-      else:
-        output = output + res_skip_acts
-    return output * x_mask
-
-class ResBlock1:
-  def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-    self.convs1 = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[i], padding=get_padding(kernel_size, dilation[i])) for i in range(3)]
-    self.convs2 = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)) for _ in range(3)]
-  def forward(self, x: Tensor, x_mask=None):
-    for c1, c2 in zip(self.convs1, self.convs2):
-      xt = x.leaky_relu(LRELU_SLOPE)
-      xt = c1(xt if x_mask is None else xt * x_mask).leaky_relu(LRELU_SLOPE)
-      x = c2(xt if x_mask is None else xt * x_mask) + x
-    return x if x_mask is None else x * x_mask
-
-class ResBlock2:
-  def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
-    self.convs = [nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation[i], padding=get_padding(kernel_size, dilation[i])) for i in range(2)]
-  def forward(self, x, x_mask=None):
-    for c in self.convs:
-      xt = x.leaky_relu(LRELU_SLOPE)
-      xt = c(xt if x_mask is None else xt * x_mask)
-      x = xt + x
-    return x if x_mask is None else x * x_mask
-
-class DDSConv: # Dilated and Depth-Separable Convolution
-  def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
-    self.channels, self.kernel_size, self.n_layers, self.p_dropout = channels, kernel_size, n_layers, p_dropout
-    self.convs_sep, self.convs_1x1, self.norms_1, self.norms_2 = [], [], [], []
-    for i in range(n_layers):
-      dilation = kernel_size ** i
-      padding = (kernel_size * dilation - dilation) // 2
-      self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, groups=channels, dilation=dilation, padding=padding))
-      self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
-      self.norms_1.append(LayerNorm(channels))
-      self.norms_2.append(LayerNorm(channels))
-  def forward(self, x, x_mask, g=None):
-    if g is not None: x = x + g
-    for i in range(self.n_layers):
-      y = self.convs_sep[i](x * x_mask)
-      y = self.norms_1[i].forward(y).gelu()
-      y = self.convs_1x1[i](y)
-      y = self.norms_2[i].forward(y).gelu()
-      x = x + y.dropout(self.p_dropout)
-    return x * x_mask
-
-class ConvFlow:
-  def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
-    self.in_channels, self.filter_channels, self.kernel_size, self.n_layers, self.num_bins, self.tail_bound = in_channels, filter_channels, kernel_size, n_layers, num_bins, tail_bound
-    self.half_channels = in_channels // 2
-    self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
-    self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
-    self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
-  def forward(self, x, x_mask, g=None, reverse=False):
-    x0, x1 = x.split([self.half_channels] * 2, 1)
-    h = self.proj(self.convs.forward(self.pre(x0), x_mask, g=g)) * x_mask
-    b, c, t = x0.shape
-    h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
-    un_normalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
-    un_normalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
-    un_normalized_derivatives = h[..., 2 * self.num_bins:]
-    x1, log_abs_det = piecewise_rational_quadratic_transform(x1, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=reverse, tails='linear', tail_bound=self.tail_bound)
-    x = x0.cat(x1, dim=1) * x_mask
-    return x if reverse else (x, Tensor.sum(log_abs_det * x_mask, [1,2]))
-
-class ResidualCouplingLayer:
-  def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
-    assert channels % 2 == 0, "channels should be divisible by 2"
-    self.channels, self.hidden_channels, self.kernel_size, self.dilation_rate, self.n_layers, self.mean_only = channels, hidden_channels, kernel_size, dilation_rate, n_layers, mean_only
-    self.half_channels = channels // 2
-    self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
-    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
-    self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
-  def forward(self, x, x_mask, g=None, reverse=False):
-    x0, x1 = x.split([self.half_channels] * 2, 1)
-    stats = self.post(self.enc.forward(self.pre(x0) * x_mask, x_mask, g=g)) * x_mask
-    if not self.mean_only:
-      m, logs = stats.split([self.half_channels] * 2, 1)
-    else:
-      m = stats
-      logs = Tensor.zeros_like(m)
-    if not reverse: return x0.cat((m + x1 * logs.exp() * x_mask), dim=1)
-    return x0.cat(((x1 - m) * (-logs).exp() * x_mask), dim=1)
-
-class Log:
-  def forward(self, x : Tensor, x_mask, reverse=False):
-    if not reverse:
-      y = x.maximum(1e-5).log() * x_mask
-      return y, (-y).sum([1, 2])
-    return x.exp() * x_mask
-
-class Flip:
-  def forward(self, x: Tensor, *args, reverse=False, **kwargs):
-    return x.flip([1]) if reverse else (x.flip([1]), Tensor.zeros(x.shape[0], dtype=x.dtype).to(device=x.device))
-
-class ElementwiseAffine:
-  def __init__(self, channels): self.m, self.logs = Tensor.zeros(channels, 1), Tensor.zeros(channels, 1)
-  def forward(self, x, x_mask, reverse=False, **kwargs): # x if reverse else y, logdet
-    return (x - self.m) * Tensor.exp(-self.logs) * x_mask if reverse \
-      else ((self.m + Tensor.exp(self.logs) * x) * x_mask, Tensor.sum(self.logs * x_mask, [1, 2]))
-
-class MultiHeadAttention:
-  def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
-    assert channels % n_heads == 0
-    self.channels, self.out_channels, self.n_heads, self.p_dropout, self.window_size, self.heads_share, self.block_length, self.proximal_bias, self.proximal_init = channels, out_channels, n_heads, p_dropout, window_size, heads_share, block_length, proximal_bias, proximal_init
-    self.attn, self.k_channels  = None, channels // n_heads
-    self.conv_q, self.conv_k, self.conv_v = [nn.Conv1d(channels, channels, 1) for _ in range(3)]
-    self.conv_o = nn.Conv1d(channels, out_channels, 1)
-    if window_size is not None: self.emb_rel_k, self.emb_rel_v = [Tensor.randn(1 if heads_share else n_heads, window_size * 2 + 1, self.k_channels) * (self.k_channels ** -0.5) for _ in range(2)]
-  def forward(self, x, c, attn_mask=None):
-    q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)
-    x, self.attn = self.attention(q, k, v, mask=attn_mask)
-    return self.conv_o(x)
-  def attention(self, query: Tensor, key: Tensor, value: Tensor, mask=None):# reshape [b, d, t] -> [b, n_h, t, d_k]
-    b, d, t_s, t_t = key.shape[0], key.shape[1], key.shape[2], query.shape[2]
-    query = query.reshape(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
-    key = key.reshape(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-    value = value.reshape(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-    scores = (query / math.sqrt(self.k_channels)) @ key.transpose(-2, -1)
-    if self.window_size is not None:
-      assert t_s == t_t, "Relative attention is only available for self-attention."
-      key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
-      rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
-      scores = scores + self._relative_position_to_absolute_position(rel_logits)
-    if mask is not None:
-      scores = Tensor.where(mask, scores, -1e4)
-      if self.block_length is not None:
-        assert t_s == t_t, "Local attention is only available for self-attention."
-        scores = Tensor.where(Tensor.ones_like(scores).triu(-self.block_length).tril(self.block_length), scores, -1e4)
-    p_attn = scores.softmax(axis=-1)  # [b, n_h, t_t, t_s]
-    output = p_attn.matmul(value)
-    if self.window_size is not None:
-      relative_weights = self._absolute_position_to_relative_position(p_attn)
-      value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
-      output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
-    output = output.transpose(2, 3).contiguous().reshape(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
-    return output, p_attn
-  def _matmul_with_relative_values(self, x, y): return x.matmul(y.unsqueeze(0))                 # x: [b, h, l, m], y: [h or 1, m, d], ret: [b, h, l, d]
-  def _matmul_with_relative_keys(self, x, y): return x.matmul(y.unsqueeze(0).transpose(-2, -1)) # x: [b, h, l, d], y: [h or 1, m, d], re, : [b, h, l, m]
-  def _get_relative_embeddings(self, relative_embeddings, length):
-    pad_length, slice_start_position = max(length - (self.window_size + 1), 0), max((self.window_size + 1) - length, 0)
-    padded_relative_embeddings = relative_embeddings if pad_length <= 0\
-      else relative_embeddings.pad(convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
-    return padded_relative_embeddings[:, slice_start_position:(slice_start_position + 2 * length - 1)] #used_relative_embeddings
-  def _relative_position_to_absolute_position(self, x: Tensor): # x: [b, h, l, 2*l-1] -> [b, h, l, l]
-    batch, heads, length, _ = x.shape
-    x = x.pad(convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
-    x_flat = x.reshape([batch, heads, length * 2 * length]).pad(convert_pad_shape([[0,0],[0,0],[0,length-1]]))
-    return x_flat.reshape([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
-  def _absolute_position_to_relative_position(self, x: Tensor): # x: [b, h, l, l] -> [b, h, l, 2*l-1]
-    batch, heads, length, _ = x.shape
-    x = x.pad(convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
-    x_flat = x.reshape([batch, heads, length**2 + length*(length -1)]).pad(convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
-    return x_flat.reshape([batch, heads, length, 2*length])[:,:,:,1:]
-
-class FFN:
-  def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
-    self.in_channels, self.out_channels, self.filter_channels, self.kernel_size, self.p_dropout, self.activation, self.causal = in_channels, out_channels, filter_channels, kernel_size, p_dropout, activation, causal
-    self.padding = self._causal_padding if causal else self._same_padding
-    self.conv_1, self.conv_2 = nn.Conv1d(in_channels, filter_channels, kernel_size), nn.Conv1d(filter_channels, out_channels, kernel_size)
-  def forward(self, x, x_mask):
-    x = self.conv_1(self.padding(x * x_mask))
-    x = x * (1.702 * x).sigmoid() if self.activation == "gelu" else x.relu()
-    return self.conv_2(self.padding(x.dropout(self.p_dropout) * x_mask)) * x_mask
-  def _causal_padding(self, x):return x if self.kernel_size == 1 else x.pad(convert_pad_shape([[0, 0], [0, 0], [self.kernel_size - 1, 0]]))
-  def _same_padding(self, x): return x if self.kernel_size == 1 else x.pad(convert_pad_shape([[0, 0], [0, 0], [(self.kernel_size - 1) // 2, self.kernel_size // 2]]))
-
-class Encoder:
-  def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
-    self.hidden_channels, self.filter_channels, self.n_heads, self.n_layers, self.kernel_size, self.p_dropout, self.window_size = hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, window_size
-    self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 = [], [], [], []
-    for _ in range(n_layers):
-      self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
-      self.norm_layers_1.append(LayerNorm(hidden_channels))
-      self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
-      self.norm_layers_2.append(LayerNorm(hidden_channels))
-  def forward(self, x, x_mask):
-    attn_mask, x = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1), x * x_mask
-    for i in range(self.n_layers):
-      y = self.attn_layers[i].forward(x, x, attn_mask).dropout(self.p_dropout)
-      x = self.norm_layers_1[i].forward(x + y)
-      y = self.ffn_layers[i].forward(x, x_mask).dropout(self.p_dropout)
-      x = self.norm_layers_2[i].forward(x + y)
-    return x * x_mask
-
-DEFAULT_MIN_BIN_WIDTH, DEFAULT_MIN_BIN_HEIGHT, DEFAULT_MIN_DERIVATIVE = 1e-3, 1e-3, 1e-3
-def piecewise_rational_quadratic_transform(inputs, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=False, tails=None, tail_bound=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
-  if tails is None: spline_fn, spline_kwargs = rational_quadratic_spline, {}
-  else: spline_fn, spline_kwargs = unconstrained_rational_quadratic_spline, {'tails': tails, 'tail_bound': tail_bound}
-  return spline_fn(inputs=inputs, un_normalized_widths=un_normalized_widths, un_normalized_heights=un_normalized_heights, un_normalized_derivatives=un_normalized_derivatives, inverse=inverse, min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative, **spline_kwargs)
-def unconstrained_rational_quadratic_spline(inputs, un_normalized_widths, un_normalized_heights, un_normalized_derivatives, inverse=False, tails='linear', tail_bound=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
-  if not tails == 'linear': raise RuntimeError('{} tails are not implemented.'.format(tails))
-  constant = np.log(np.exp(1 - min_derivative) - 1).item()
-  un_normalized_derivatives = cat_lr(un_normalized_derivatives, constant, constant)
-  output, log_abs_det = rational_quadratic_spline(inputs=inputs.squeeze(dim=0).squeeze(dim=0), unnormalized_widths=un_normalized_widths.squeeze(dim=0).squeeze(dim=0), unnormalized_heights=un_normalized_heights.squeeze(dim=0).squeeze(dim=0), unnormalized_derivatives=un_normalized_derivatives.squeeze(dim=0).squeeze(dim=0), inverse=inverse, left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, min_bin_width=min_bin_width, min_bin_height=min_bin_height, min_derivative=min_derivative)
-  return output.unsqueeze(dim=0).unsqueeze(dim=0), log_abs_det.unsqueeze(dim=0).unsqueeze(dim=0)
-def rational_quadratic_spline(inputs: Tensor, unnormalized_widths: Tensor, unnormalized_heights: Tensor, unnormalized_derivatives: Tensor, inverse=False, left=0., right=1., bottom=0., top=1., min_bin_width=DEFAULT_MIN_BIN_WIDTH, min_bin_height=DEFAULT_MIN_BIN_HEIGHT, min_derivative=DEFAULT_MIN_DERIVATIVE):
-  num_bins = unnormalized_widths.shape[-1]
-  if min_bin_width * num_bins > 1.0: raise ValueError('Minimal bin width too large for the number of bins')
-  if min_bin_height * num_bins > 1.0: raise ValueError('Minimal bin height too large for the number of bins')
-  widths = min_bin_width + (1 - min_bin_width * num_bins) * unnormalized_widths.softmax(axis=-1)
-  cum_widths = cat_lr(((right - left) * widths[..., :-1].cumsum(axis=1) + left), left, right + 1e-6 if not inverse else right)
-  widths = cum_widths[..., 1:] - cum_widths[..., :-1]
-  derivatives = min_derivative + (unnormalized_derivatives.exp()+1).log()
-  heights = min_bin_height + (1 - min_bin_height * num_bins) * unnormalized_heights.softmax(axis=-1)
-  cum_heights = cat_lr(((top - bottom) * heights[..., :-1].cumsum(axis=1) + bottom), bottom, top + 1e-6 if inverse else top)
-  heights = cum_heights[..., 1:] - cum_heights[..., :-1]
-  bin_idx = ((inputs[..., None] >= (cum_heights if inverse else cum_widths)).sum(axis=-1) - 1)[..., None]
-  input_cum_widths = gather(cum_widths, bin_idx, axis=-1)[..., 0]
-  input_bin_widths = gather(widths, bin_idx, axis=-1)[..., 0]
-  input_cum_heights = gather(cum_heights, bin_idx, axis=-1)[..., 0]
-  input_delta = gather(heights / widths, bin_idx, axis=-1)[..., 0]
-  input_derivatives = gather(derivatives, bin_idx, axis=-1)[..., 0]
-  input_derivatives_plus_one = gather(derivatives[..., 1:], bin_idx, axis=-1)[..., 0]
-  input_heights = gather(heights, bin_idx, axis=-1)[..., 0]
-  if inverse:
-    a = ((inputs - input_cum_heights) * (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + input_heights * (input_delta - input_derivatives))
-    b = (input_heights * input_derivatives - (inputs - input_cum_heights) * (input_derivatives + input_derivatives_plus_one - 2 * input_delta))
-    c = - input_delta * (inputs - input_cum_heights)
-    discriminant = b.square() - 4 * a * c
-    # assert (discriminant.numpy() >= 0).all()
-    root = (2 * c) / (-b - discriminant.sqrt())
-    theta_one_minus_theta = root * (1 - root)
-    denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta)
-    derivative_numerator = input_delta.square() * (input_derivatives_plus_one * root.square() + 2 * input_delta * theta_one_minus_theta + input_derivatives * (1 - root).square())
-    return root * input_bin_widths + input_cum_widths, -(derivative_numerator.log() - 2 * denominator.log())
-  theta = (inputs - input_cum_widths) / input_bin_widths
-  theta_one_minus_theta = theta * (1 - theta)
-  numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta)
-  denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta)
-  derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) + 2 * input_delta * theta_one_minus_theta + input_derivatives * (1 - theta).pow(2))
-  return input_cum_heights + numerator / denominator, derivative_numerator.log() - 2 * denominator.log()
-
-def sequence_mask(length: Tensor, max_length): return Tensor.arange(max_length, dtype=length.dtype, device=length.device).unsqueeze(0) < length.unsqueeze(1)
-def generate_path(duration: Tensor, mask: Tensor):  # duration: [b, 1, t_x], mask: [b, 1, t_y, t_x]
-  b, _, t_y, t_x = mask.shape
-  path = sequence_mask(duration.cumsum(axis=2).reshape(b * t_x), t_y).cast(mask.dtype).reshape(b, t_x, t_y)
-  path = path - path.pad(convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
-  return path.unsqueeze(1).transpose(2, 3) * mask
-def fused_add_tanh_sigmoid_multiply(input_a: Tensor, input_b: Tensor, n_channels: int):
-  n_channels_int, in_act = n_channels, input_a + input_b
-  t_act, s_act = in_act[:, :n_channels_int, :].tanh(), in_act[:, n_channels_int:, :].sigmoid()
-  return t_act * s_act
-
-def cat_lr(t, left, right): return Tensor.full(get_shape(t), left).cat(t, dim=-1).cat(Tensor.full(get_shape(t), right), dim=-1)
-def get_shape(tensor):
-  (shape := list(tensor.shape))[-1] = 1
-  return tuple(shape)
-def convert_pad_shape(pad_shape): return tuple(tuple(x) for x in pad_shape)
-def get_padding(kernel_size, dilation=1): return int((kernel_size*dilation - dilation)/2)
-
-def gather(x, indices, axis):
-  indices = (indices < 0).where(indices + x.shape[axis], indices).transpose(0, axis)
-  permute_args = list(range(x.ndim))
-  permute_args[0], permute_args[axis] = permute_args[axis], permute_args[0]
-  permute_args.append(permute_args.pop(0))
-  x = x.permute(*permute_args)
-  reshape_arg = [1] * x.ndim + [x.shape[-1]]
-  return ((indices.unsqueeze(indices.ndim).expand(*indices.shape, x.shape[-1]) ==
-           Tensor.arange(x.shape[-1]).reshape(*reshape_arg).expand(*indices.shape, x.shape[-1])) * x).sum(indices.ndim).transpose(0, axis)
-
-def norm_except_dim(v, dim):
-  if dim == -1: return np.linalg.norm(v)
-  if dim == 0:
-    (output_shape := [1] * v.ndim)[0] = v.shape[0]
-    return np.linalg.norm(v.reshape(v.shape[0], -1), axis=1).reshape(output_shape)
-  if dim == v.ndim - 1:
-    (output_shape := [1] * v.ndim)[-1] = v.shape[-1]
-    return np.linalg.norm(v.reshape(-1, v.shape[-1]), axis=0).reshape(output_shape)
-  transposed_v = np.transpose(v, (dim,) + tuple(i for i in range(v.ndim) if i != dim))
-  return np.transpose(norm_except_dim(transposed_v, 0), (dim,) + tuple(i for i in range(v.ndim) if i != dim))
-def weight_norm(v: Tensor, g: Tensor, dim):
-  v, g = v.numpy(), g.numpy()
-  return Tensor(v * (g / norm_except_dim(v, dim)))
-
-# HPARAMS LOADING
-def get_hparams_from_file(path):
-  with open(path, "r") as f:
-    data = f.read()
-  return HParams(**json.loads(data))
-class HParams:
-  def __init__(self, **kwargs):
-    for k, v in kwargs.items(): self[k] = v if type(v) != dict else HParams(**v)
-  def keys(self): return self.__dict__.keys()
-  def items(self): return self.__dict__.items()
-  def values(self): return self.__dict__.values()
-  def __len__(self): return len(self.__dict__)
-  def __getitem__(self, key): return getattr(self, key)
-  def __setitem__(self, key, value): return setattr(self, key, value)
-  def __contains__(self, key): return key in self.__dict__
-  def __repr__(self): return self.__dict__.__repr__()
-
-# MODEL LOADING
-def load_model(symbols, hps, model) -> Synthesizer:
-  net_g = Synthesizer(len(symbols), hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers = hps.data.n_speakers, **hps.model)
-  _ = load_checkpoint(fetch(model[1]), net_g, None)
-  return net_g
-def load_checkpoint(checkpoint_path, model: Synthesizer, optimizer=None, skip_list=[]):
-  assert Path(checkpoint_path).is_file()
-  start_time = time.time()
-  checkpoint_dict = torch_load(checkpoint_path)
-  iteration, learning_rate = checkpoint_dict['iteration'], checkpoint_dict['learning_rate']
-  if optimizer: optimizer.load_state_dict(checkpoint_dict['optimizer'])
-  saved_state_dict = checkpoint_dict['model']
-  weight_g, weight_v, parent = None, None, None
-  for key, v in saved_state_dict.items():
-    if any(layer in key for layer in skip_list): continue
-    try:
-      obj, skip = model, False
-      for k in key.split('.'):
-        if k.isnumeric(): obj = obj[int(k)]
-        elif isinstance(obj, dict): obj = obj[k]
-        else:
-          if isinstance(obj, (LayerNorm, nn.LayerNorm)) and k in ["gamma", "beta"]:
-            k = "weight" if k == "gamma" else "bias"
-          elif k in ["weight_g", "weight_v"]:
-            parent, skip = obj, True
-            if k == "weight_g": weight_g = v
-            else: weight_v = v
-          if not skip: obj = getattr(obj, k)
-      if weight_g is not None and weight_v is not None:
-        setattr(obj, "weight_g", weight_g.numpy())
-        setattr(obj, "weight_v", weight_v.numpy())
-        obj, v = getattr(parent, "weight"), weight_norm(weight_v, weight_g, 0)
-        weight_g, weight_v, parent, skip = None, None, None, False
-      if not skip and obj.shape == v.shape: obj.assign(v.to(obj.device))
-      elif not skip: logging.error(f"MISMATCH SHAPE IN {key}, {obj.shape} {v.shape}")
-    except Exception as e: raise e
-  logging.info(f"Loaded checkpoint '{checkpoint_path}' (iteration {iteration}) in {time.time() - start_time:.4f}s")
-  return model, optimizer, learning_rate, iteration
-
-# Used for cleaning input text and mapping to symbols
-class TextMapper: # Based on https://github.com/keithito/tacotron
-  def __init__(self, symbols, apply_cleaners=True):
-    self.apply_cleaners, self.symbols, self._inflect = apply_cleaners, symbols, None
-    self._symbol_to_id, _id_to_symbol = {s: i for i, s in enumerate(symbols)}, {i: s for i, s in enumerate(symbols)}
-    self._whitespace_re, self._abbreviations = re.compile(r'\s+'), [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [('mrs', 'misess'), ('mr', 'mister'), ('dr', 'doctor'), ('st', 'saint'), ('co', 'company'), ('jr', 'junior'), ('maj', 'major'), ('gen', 'general'), ('drs', 'doctors'), ('rev', 'reverend'), ('lt', 'lieutenant'), ('hon', 'honorable'), ('sgt', 'sergeant'), ('capt', 'captain'), ('esq', 'esquire'), ('ltd', 'limited'), ('col', 'colonel'), ('ft', 'fort'), ]]
-    self.phonemizer = EspeakBackend(
-        language="en-us", punctuation_marks=Punctuation.default_marks(), preserve_punctuation=True, with_stress=True,
-    )
-  def text_to_sequence(self, text, cleaner_names):
-    if self.apply_cleaners:
-      for name in cleaner_names:
-        cleaner = getattr(self, name)
-        if not cleaner: raise ModuleNotFoundError('Unknown cleaner: %s' % name)
-        text = cleaner(text)
-    else: text = text.strip()
-    return [self._symbol_to_id[symbol] for symbol in text]
-  def get_text(self, text, add_blank=False, cleaners=('english_cleaners2',)):
-    text_norm = self.text_to_sequence(text, cleaners)
-    return Tensor(self.intersperse(text_norm, 0) if add_blank else text_norm, dtype=dtypes.int64)
-  def intersperse(self, lst, item):
-    (result := [item] * (len(lst) * 2 + 1))[1::2] = lst
-    return result
-  def phonemize(self, text, strip=True): return _phonemize(self.phonemizer, text, default_separator, strip, 1, False, False)
-  def filter_oov(self, text): return "".join(list(filter(lambda x: x in self._symbol_to_id, text)))
-  def base_english_cleaners(self, text): return self.collapse_whitespace(self.phonemize(self.expand_abbreviations(unidecode(text.lower()))))
-  def english_cleaners2(self, text): return self.base_english_cleaners(text)
-  def transliteration_cleaners(self, text): return self.collapse_whitespace(unidecode(text.lower()))
-  def cjke_cleaners(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_ipa2(text).replace('ɑ', 'a').replace('ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')))
-  def cjke_cleaners2(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_ipa2(text)))
-  def cjks_cleaners(self, text): return re.sub(r'([^\.,!\?\-…~])$', r'\1.', re.sub(r'\s+$', '', self.english_to_lazy_ipa(text)))
-  def english_to_ipa2(self, text):
-    _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ ('r', 'ɹ'), ('ʤ', 'dʒ'), ('ʧ', 'tʃ')]]
-    return reduce(lambda t, rx: re.sub(rx[0], rx[1], t), _ipa_to_ipa2, self.mark_dark_l(self.english_to_ipa(text))).replace('...', '…')
-  def mark_dark_l(self, text): return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ' + x.group(1), text)
-  def english_to_ipa(self, text):
-    import eng_to_ipa as ipa
-    return self.collapse_whitespace(ipa.convert(self.normalize_numbers(self.expand_abbreviations(unidecode(text).lower()))))
-  def english_to_lazy_ipa(self, text):
-    _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [('r', 'ɹ'), ('æ', 'e'), ('ɑ', 'a'), ('ɔ', 'o'), ('ð', 'z'), ('θ', 's'), ('ɛ', 'e'), ('ɪ', 'i'), ('ʊ', 'u'), ('ʒ', 'ʥ'), ('ʤ', 'ʥ'), ('ˈ', '↓')]]
-    return reduce(lambda t, rx: re.sub(rx[0], rx[1], t), _lazy_ipa, self.english_to_ipa(text))
-  def expand_abbreviations(self, text): return reduce(lambda t, abbr: re.sub(abbr[0], abbr[1], t), self._abbreviations, text)
-  def collapse_whitespace(self, text): return re.sub(self._whitespace_re, ' ', text)
-  def normalize_numbers(self, text):
-    import inflect
-    self._inflect = inflect.engine()
-    text = re.sub(re.compile(r'([0-9][0-9\,]+[0-9])'), self._remove_commas, text)
-    text = re.sub(re.compile(r'£([0-9\,]*[0-9]+)'), r'\1 pounds', text)
-    text = re.sub(re.compile(r'\$([0-9\.\,]*[0-9]+)'), self._expand_dollars, text)
-    text = re.sub(re.compile(r'([0-9]+\.[0-9]+)'), self._expand_decimal_point, text)
-    text = re.sub(re.compile(r'[0-9]+(st|nd|rd|th)'), self._expand_ordinal, text)
-    text = re.sub(re.compile(r'[0-9]+'), self._expand_number, text)
-    return text
-  def _remove_commas(self, m): return m.group(1).replace(',', '') # george won't like this
-  def _expand_dollars(self, m):
-    match = m.group(1)
-    parts = match.split('.')
-    if len(parts) > 2: return match + ' dollars'  # Unexpected format
-    dollars, cents = int(parts[0]) if parts[0] else 0, int(parts[1]) if len(parts) > 1 and parts[1] else 0
-    if dollars and cents: return '%s %s, %s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars', cents, 'cent' if cents == 1 else 'cents')
-    if dollars: return '%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars')
-    if cents: return '%s %s' % (cents, 'cent' if cents == 1 else 'cents')
-    return 'zero dollars'
-  def _expand_decimal_point(self, m): return m.group(1).replace('.', ' point ')
-  def _expand_ordinal(self, m): return self._inflect.number_to_words(m.group(0))
-  def _expand_number(self, _inflect, m):
-    num = int(m.group(0))
-    if 1000 < num < 3000:
-      if num == 2000: return 'two thousand'
-      if 2000 < num < 2010: return 'two thousand ' + self._inflect.number_to_words(num % 100)
-      if num % 100 == 0: return self._inflect.number_to_words(num // 100) + ' hundred'
-      return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
-    return self._inflect.number_to_words(num, andword='')
-
-#########################################################################################
-# PAPER: https://arxiv.org/abs/2106.06103
-# CODE: https://github.com/jaywalnut310/vits/tree/main
-#########################################################################################
-# INSTALLATION: this is based on default config, dependencies are for preprocessing.
-# vctk, ljs                      | pip3 install unidecode phonemizer          | phonemizer requires [eSpeak](https://espeak.sourceforge.net) backend to be installed on your system
-# mmts-tts                       | pip3 install unidecode                     |
-# uma_trilingual, cjks, voistock | pip3 install unidecode inflect eng_to_ipa  |
-#########################################################################################
-# Some good speakers to try out, there may be much better ones, I only tried out a few:
-# male vctk 1  | --model_to_use vctk --speaker_id 2
-# male vctk 2  | --model_to_use vctk --speaker_id 6
-# anime lady 1 | --model_to_use uma_trilingual --speaker_id 36
-# anime lady 2 | --model_to_use uma_trilingual --speaker_id 121
-#########################################################################################
-VITS_PATH = Path(__file__).parents[1] / "weights/VITS/"
-MODELS = { # config_url, weights_url
-  "ljs": ("https://raw.githubusercontent.com/jaywalnut310/vits/main/configs/ljs_base.json", "https://drive.google.com/uc?export=download&id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT&confirm=t"),
-  "vctk": ("https://huggingface.co/csukuangfj/vits-vctk/resolve/main/vctk_base.json", "https://huggingface.co/csukuangfj/vits-vctk/resolve/main/pretrained_vctk.pth"),
-  "mmts-tts": ("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/config.json", "https://huggingface.co/facebook/mms-tts/resolve/main/full_models/eng/G_100000.pth"),
-  "uma_trilingual": ("https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/raw/main/configs/uma_trilingual.json", "https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth"),
-  "cjks": ("https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/config.json", "https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/14/model.pth"),
-  "voistock": ("https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/15/config.json", "https://huggingface.co/spaces/skytnt/moe-tts/resolve/main/saved_model/15/model.pth"),
-}
-Y_LENGTH_ESTIMATE_SCALARS = {"ljs": 2.8, "vctk": 1.74, "mmts-tts": 1.9, "uma_trilingual": 2.3, "cjks": 3.3, "voistock": 3.1}
-if __name__ == '__main__':
-  logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
-  parser = argparse.ArgumentParser()
-  parser.add_argument("--model_to_use", default="vctk", help="Specify the model to use. Default is 'vctk'.")
-  parser.add_argument("--speaker_id", type=int, default=6, help="Specify the speaker ID. Default is 6.")
-  parser.add_argument("--out_path", default=None, help="Specify the full output path. Overrides the --out_dir and --name parameter.")
-  parser.add_argument("--out_dir", default=str(Path(__file__).parents[1] / "temp"), help="Specify the output path.")
-  parser.add_argument("--base_name", default="test", help="Specify the base of the output file name. Default is 'test'.")
-  parser.add_argument("--text_to_synthesize", default="""Hello person. If the code you are contributing isn't some of the highest quality code you've written in your life, either put in the effort to make it great, or don't bother.""", help="Specify the text to synthesize. Default is a greeting message.")
-  parser.add_argument("--noise_scale", type=float, default=0.667, help="Specify the noise scale. Default is 0.667.")
-  parser.add_argument("--noise_scale_w", type=float, default=0.8, help="Specify the noise scale w. Default is 0.8.")
-  parser.add_argument("--length_scale", type=float, default=1, help="Specify the length scale. Default is 1.")
-  parser.add_argument("--seed", type=int, default=1337, help="Specify the seed (set to None if no seed). Default is 1337.")
-  parser.add_argument("--num_channels", type=int, default=1, help="Specify the number of audio output channels. Default is 1.")
-  parser.add_argument("--sample_width", type=int, default=2, help="Specify the number of bytes per sample, adjust if necessary. Default is 2.")
-  parser.add_argument("--emotion_path", type=str, default=None, help="Specify the path to emotion reference.")
-  parser.add_argument("--estimate_max_y_length", type=str, default=False, help="If true, overestimate the output length and then trim it to the correct length, to prevent premature realization, much more performant for larger inputs, for smaller inputs not so much. Default is False.")
-  args = parser.parse_args()
-
-  model_config = MODELS[args.model_to_use]
-
-  # Load the hyperparameters from the config file.
-  hps = get_hparams_from_file(fetch(model_config[0]))
-
-  # If model has multiple speakers, validate speaker id and retrieve name if available.
-  model_has_multiple_speakers = hps.data.n_speakers > 0
-  if model_has_multiple_speakers:
-    logging.info(f"Model has {hps.data.n_speakers} speakers")
-    if args.speaker_id >= hps.data.n_speakers: raise ValueError(f"Speaker ID {args.speaker_id} is invalid for this model.")
-    speaker_name = "?"
-    if hps.__contains__("speakers"): # maps speaker ids to names
-      speakers = hps.speakers
-      if isinstance(speakers, List): speakers = {speaker: i for i, speaker in enumerate(speakers)}
-      speaker_name = next((key for key, value in speakers.items() if value == args.speaker_id), None)
-    logging.info(f"You selected speaker {args.speaker_id} (name: {speaker_name})")
-
-  # Load emotions if any. TODO: find an english model with emotions, this is untested atm.
-  emotion_embedding = None
-  if args.emotion_path is not None:
-    if args.emotion_path.endswith(".npy"): emotion_embedding = Tensor(np.load(args.emotion_path), dtype=dtypes.int64).unsqueeze(0)
-    else: raise ValueError("Emotion path must be a .npy file.")
-
-  # Load symbols, instantiate TextMapper and clean the text.
-  if hps.__contains__("symbols"): symbols = hps.symbols
-  elif args.model_to_use == "mmts-tts": symbols = [x.replace("\n", "") for x in fetch("https://huggingface.co/facebook/mms-tts/raw/main/full_models/eng/vocab.txt").open(encoding="utf-8").readlines()]
-  else: symbols = ['_'] + list(';:,.!?¡¿—…"«»“” ') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz') + list("ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ")
-  text_mapper = TextMapper(apply_cleaners=True, symbols=symbols)
-
-  # Load the model.
-  if args.seed is not None:
-    Tensor.manual_seed(args.seed)
-    np.random.seed(args.seed)
-  net_g = load_model(text_mapper.symbols, hps, model_config)
-  logging.debug(f"Loaded model with hps: {hps}")
-
-  # Convert the input text to a tensor.
-  text_to_synthesize = args.text_to_synthesize
-  if args.model_to_use == "mmts-tts": text_to_synthesize = text_mapper.filter_oov(text_to_synthesize.lower())
-  stn_tst = text_mapper.get_text(text_to_synthesize, hps.data.add_blank, hps.data.text_cleaners)
-  logging.debug(f"Converted input text to tensor \"{text_to_synthesize}\" -> Tensor({stn_tst.shape}): {stn_tst.numpy()}")
-  x_tst, x_tst_lengths = stn_tst.unsqueeze(0), Tensor([stn_tst.shape[0]], dtype=dtypes.int64)
-  sid = Tensor([args.speaker_id], dtype=dtypes.int64) if model_has_multiple_speakers else None
-
-  # Perform inference.
-  start_time = time.time()
-  audio_tensor = net_g.infer(x_tst, x_tst_lengths, sid, args.noise_scale, args.length_scale, args.noise_scale_w, emotion_embedding=emotion_embedding,
-                             max_y_length_estimate_scale=Y_LENGTH_ESTIMATE_SCALARS[args.model_to_use] if args.estimate_max_y_length else None)[0, 0].realize()
-  logging.info(f"Inference took {(time.time() - start_time):.2f}s")
-
-  # Save the audio output.
-  audio_data = (np.clip(audio_tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
-  out_path = Path(args.out_path or Path(args.out_dir)/f"{args.model_to_use}{f'_sid_{args.speaker_id}' if model_has_multiple_speakers else ''}_{args.base_name}.wav")
-  out_path.parent.mkdir(parents=True, exist_ok=True)
-  with wave.open(str(out_path), 'wb') as wav_file:
-    wav_file.setnchannels(args.num_channels)
-    wav_file.setsampwidth(args.sample_width)
-    wav_file.setframerate(hps.data.sampling_rate)
-    wav_file.setnframes(len(audio_data))
-    wav_file.writeframes(audio_data.tobytes())
-  logging.info(f"Saved audio output to {out_path}")
diff --git a/extra/datasets/coco.py b/extra/datasets/coco.py
deleted file mode 100644
index 0952e37701..0000000000
--- a/extra/datasets/coco.py
+++ /dev/null
@@ -1,199 +0,0 @@
-import json
-import pathlib
-import zipfile
-import numpy as np
-from tinygrad.helpers import fetch
-import pycocotools._mask as _mask
-from examples.mask_rcnn import Masker
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
-
-iou         = _mask.iou
-merge       = _mask.merge
-frPyObjects = _mask.frPyObjects
-
-BASEDIR = pathlib.Path(__file__).parent / "COCO"
-BASEDIR.mkdir(exist_ok=True)
-
-def create_dict(key_row, val_row, rows): return {row[key_row]:row[val_row] for row in rows}
-
-
-if not pathlib.Path(BASEDIR/'val2017').is_dir():
-  fn = fetch('http://images.cocodataset.org/zips/val2017.zip')
-  with zipfile.ZipFile(fn, 'r') as zip_ref:
-    zip_ref.extractall(BASEDIR)
-  fn.unlink()
-
-
-if not pathlib.Path(BASEDIR/'annotations').is_dir():
-  fn = fetch('http://images.cocodataset.org/annotations/annotations_trainval2017.zip')
-  with zipfile.ZipFile(fn, 'r') as zip_ref:
-    zip_ref.extractall(BASEDIR)
-  fn.unlink()
-
-with open(BASEDIR/'annotations/instances_val2017.json', 'r') as f:
-  annotations_raw = json.loads(f.read())
-images = annotations_raw['images']
-categories = annotations_raw['categories']
-annotations = annotations_raw['annotations']
-file_name_to_id = create_dict('file_name', 'id', images)
-id_to_width = create_dict('id', 'width', images)
-id_to_height = create_dict('id', 'height', images)
-json_category_id_to_contiguous_id = {v['id']: i + 1 for i, v in enumerate(categories)}
-contiguous_category_id_to_json_id = {v:k for k,v in json_category_id_to_contiguous_id.items()}
-
-
-def encode(bimask):
-  if len(bimask.shape) == 3:
-    return _mask.encode(bimask)
-  elif len(bimask.shape) == 2:
-    h, w = bimask.shape
-    return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0]
-
-def decode(rleObjs):
-  if type(rleObjs) == list:
-    return _mask.decode(rleObjs)
-  else:
-    return _mask.decode([rleObjs])[:,:,0]
-
-def area(rleObjs):
-  if type(rleObjs) == list:
-    return _mask.area(rleObjs)
-  else:
-    return _mask.area([rleObjs])[0]
-
-def toBbox(rleObjs):
-  if type(rleObjs) == list:
-    return _mask.toBbox(rleObjs)
-  else:
-    return _mask.toBbox([rleObjs])[0]
-
-
-def convert_prediction_to_coco_bbox(file_name, prediction):
-  coco_results = []
-  try:
-    original_id = file_name_to_id[file_name]
-    if len(prediction) == 0:
-      return coco_results
-
-    image_width = id_to_width[original_id]
-    image_height = id_to_height[original_id]
-    prediction = prediction.resize((image_width, image_height))
-    prediction = prediction.convert("xywh")
-
-    boxes = prediction.bbox.numpy().tolist()
-    scores = prediction.get_field("scores").numpy().tolist()
-    labels = prediction.get_field("labels").numpy().tolist()
-
-    mapped_labels = [contiguous_category_id_to_json_id[int(i)] for i in labels]
-
-    coco_results.extend(
-      [
-        {
-          "image_id": original_id,
-          "category_id": mapped_labels[k],
-          "bbox": box,
-          "score": scores[k],
-        }
-          for k, box in enumerate(boxes)
-      ]
-    )
-  except Exception as e:
-    print(file_name, e)
-  return coco_results
-
-masker = Masker(threshold=0.5, padding=1)
-
-def convert_prediction_to_coco_mask(file_name, prediction):
-  coco_results = []
-  try:
-    original_id = file_name_to_id[file_name]
-    if len(prediction) == 0:
-      return coco_results
-
-    image_width = id_to_width[original_id]
-    image_height = id_to_height[original_id]
-    prediction = prediction.resize((image_width, image_height))
-    masks = prediction.get_field("mask")
-
-    scores = prediction.get_field("scores").numpy().tolist()
-    labels = prediction.get_field("labels").numpy().tolist()
-
-    masks = masker([masks], [prediction])[0].numpy()
-
-    rles = [
-      encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
-      for mask in masks
-    ]
-    for rle in rles:
-      rle["counts"] = rle["counts"].decode("utf-8")
-
-    mapped_labels = [contiguous_category_id_to_json_id[int(i)] for i in labels]
-
-    coco_results.extend(
-      [
-        {
-          "image_id": original_id,
-          "category_id": mapped_labels[k],
-          "segmentation": rle,
-          "score": scores[k],
-        }
-          for k, rle in enumerate(rles)
-      ]
-    )
-  except Exception as e:
-    print(file_name, e)
-  return coco_results
-
-
-
-def accumulate_predictions_for_coco(coco_results, json_result_file, rm=False):
-  path = pathlib.Path(json_result_file)
-  if rm and path.exists(): path.unlink()
-  with open(path, "a") as f:
-    for s in coco_results:
-      f.write(json.dumps(s))
-      f.write('\n')
-
-def remove_dup(l):
-  seen = set()
-  seen_add = seen.add
-  return [x for x in l if not (x in seen or seen_add(x))]
-
-class NpEncoder(json.JSONEncoder):
-  def default(self, obj):
-    if isinstance(obj, np.integer):
-      return int(obj)
-    if isinstance(obj, np.floating):
-      return float(obj)
-    if isinstance(obj, np.ndarray):
-      return obj.tolist()
-    return super(NpEncoder, self).default(obj)
-
-
-def evaluate_predictions_on_coco(json_result_file, iou_type="bbox"):
-  coco_results = []
-  with open(json_result_file, "r") as f:
-    for line in f:
-      coco_results.append(json.loads(line))
-
-  coco_gt = COCO(str(BASEDIR/'annotations/instances_val2017.json'))
-  set_of_json = remove_dup([json.dumps(d, cls=NpEncoder) for d in coco_results])
-  unique_list = [json.loads(s) for s in set_of_json]
-
-  with open(f'{json_result_file}.flattend', "w") as f:
-    json.dump(unique_list, f)
-
-  coco_dt = coco_gt.loadRes(str(f'{json_result_file}.flattend'))
-  coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
-  coco_eval.evaluate()
-  coco_eval.accumulate()
-  coco_eval.summarize()
-  return coco_eval
-
-def iterate(files, bs=1):
-  batch = []
-  for file in files:
-    batch.append(file)
-    if len(batch) >= bs: yield batch; batch = []
-  if len(batch) > 0: yield batch; batch = []
diff --git a/extra/disassemblers/adreno/.gitignore b/extra/disassemblers/adreno/.gitignore
deleted file mode 100644
index 40e681866a..0000000000
--- a/extra/disassemblers/adreno/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-disasm.so
\ No newline at end of file
diff --git a/extra/disassemblers/adreno/README b/extra/disassemblers/adreno/README
deleted file mode 100644
index 2a534d4795..0000000000
--- a/extra/disassemblers/adreno/README
+++ /dev/null
@@ -1,5 +0,0 @@
-From the Freedreno project
-
-https://gallium.readthedocs.io/en/latest/gallium/drivers/freedreno.html
-
-In Mesa3D, so licensed MIT.
diff --git a/extra/disassemblers/adreno/disasm-a3xx.c b/extra/disassemblers/adreno/disasm-a3xx.c
deleted file mode 100644
index 711528996d..0000000000
--- a/extra/disassemblers/adreno/disasm-a3xx.c
+++ /dev/null
@@ -1,1431 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <string.h>
-#include <assert.h>
-
-//#include <util/u_debug.h>
-
-#include "util/macros.h"
-#include "instr-a3xx.h"
-
-/* bitmask of debug flags */
-enum debug_t {
-  PRINT_RAW      = 0x1,    /* dump raw hexdump */
-  PRINT_VERBOSE  = 0x2,
-  EXPAND_REPEAT  = 0x4,
-};
-
-static enum debug_t debug = PRINT_RAW | PRINT_VERBOSE | EXPAND_REPEAT;
-
-static const char *levels[] = {
-    "",
-    "\t",
-    "\t\t",
-    "\t\t\t",
-    "\t\t\t\t",
-    "\t\t\t\t\t",
-    "\t\t\t\t\t\t",
-    "\t\t\t\t\t\t\t",
-    "\t\t\t\t\t\t\t\t",
-    "\t\t\t\t\t\t\t\t\t",
-    "x",
-    "x",
-    "x",
-    "x",
-    "x",
-    "x",
-};
-
-static const char *component = "xyzw";
-
-static const char *type[] = {
-    [TYPE_F16] = "f16",
-    [TYPE_F32] = "f32",
-    [TYPE_U16] = "u16",
-    [TYPE_U32] = "u32",
-    [TYPE_S16] = "s16",
-    [TYPE_S32] = "s32",
-    [TYPE_U8]  = "u8",
-    [TYPE_S8]  = "s8",
-};
-
-struct disasm_ctx {
-  FILE *out;
-  int level;
-  unsigned gpu_id;
-
-  /* current instruction repeat flag: */
-  unsigned repeat;
-  /* current instruction repeat indx/offset (for --expand): */
-  unsigned repeatidx;
-
-  unsigned instructions;
-};
-
-static const char *float_imms[] = {
-  "0.0",
-  "0.5",
-  "1.0",
-  "2.0",
-  "e",
-  "pi",
-  "1/pi",
-  "1/log2(e)",
-  "log2(e)",
-  "1/log2(10)",
-  "log2(10)",
-  "4.0",
-};
-
-static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full,
-    bool is_float, bool r,
-    bool c, bool im, bool neg, bool abs, bool addr_rel)
-{
-  const char type = c ? 'c' : 'r';
-
-  // XXX I prefer - and || for neg/abs, but preserving format used
-  // by libllvm-a3xx for easy diffing..
-
-  if (abs && neg)
-    fprintf(ctx->out, "(absneg)");
-  else if (neg)
-    fprintf(ctx->out, "(neg)");
-  else if (abs)
-    fprintf(ctx->out, "(abs)");
-
-  if (r)
-    fprintf(ctx->out, "(r)");
-
-  if (im) {
-    if (is_float && full && reg.iim_val < ARRAY_SIZE(float_imms)) {
-      fprintf(ctx->out, "(%s)", float_imms[reg.iim_val]);
-    } else {
-      fprintf(ctx->out, "%d", reg.iim_val);
-    }
-  } else if (addr_rel) {
-    /* I would just use %+d but trying to make it diff'able with
-     * libllvm-a3xx...
-     */
-    if (reg.iim_val < 0)
-      fprintf(ctx->out, "%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val);
-    else if (reg.iim_val > 0)
-      fprintf(ctx->out, "%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val);
-    else
-      fprintf(ctx->out, "%s%c<a0.x>", full ? "" : "h", type);
-  } else if ((reg.num == REG_A0) && !c) {
-    /* This matches libllvm output, the second (scalar) address register
-     * seems to be called a1.x instead of a0.y.
-     */
-    fprintf(ctx->out, "a%d.x", reg.comp);
-  } else if ((reg.num == REG_P0) && !c) {
-    fprintf(ctx->out, "p0.%c", component[reg.comp]);
-  } else {
-    fprintf(ctx->out, "%s%c%d.%c", full ? "" : "h", type, reg.num, component[reg.comp]);
-  }
-}
-
-static unsigned regidx(reg_t reg)
-{
-  return (4 * reg.num) + reg.comp;
-}
-
-static reg_t idxreg(unsigned idx)
-{
-  return (reg_t){
-    .comp = idx & 0x3,
-    .num  = idx >> 2,
-  };
-}
-
-static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel)
-{
-  reg = idxreg(regidx(reg) + ctx->repeatidx);
-  print_reg(ctx, reg, full, false, false, false, false, false, false, addr_rel);
-}
-
-/* TODO switch to using reginfo struct everywhere, since more readable
- * than passing a bunch of bools to print_reg_src
- */
-
-struct reginfo {
-  reg_t reg;
-  bool full;
-  bool r;
-  bool c;
-  bool f; /* src reg is interpreted as float, used for printing immediates */
-  bool im;
-  bool neg;
-  bool abs;
-  bool addr_rel;
-};
-
-static void print_src(struct disasm_ctx *ctx, struct reginfo *info)
-{
-  reg_t reg = info->reg;
-
-  if (info->r)
-    reg = idxreg(regidx(info->reg) + ctx->repeatidx);
-
-  print_reg(ctx, reg, info->full, info->f, info->r, info->c, info->im,
-      info->neg, info->abs, info->addr_rel);
-}
-
-//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info)
-//{
-//  print_reg_dst(ctx, info->reg, info->full, info->addr_rel);
-//}
-
-static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr)
-{
-  static const struct {
-    const char *suffix;
-    int nsrc;
-    bool idx;
-  } brinfo[7] = {
-    [BRANCH_PLAIN] = { "r",   1, false },
-    [BRANCH_OR]    = { "rao", 2, false },
-    [BRANCH_AND]   = { "raa", 2, false },
-    [BRANCH_CONST] = { "rac", 0, true  },
-    [BRANCH_ANY]   = { "any", 1, false },
-    [BRANCH_ALL]   = { "all", 1, false },
-    [BRANCH_X]     = { "rax", 0, false },
-  };
-  instr_cat0_t *cat0 = &instr->cat0;
-
-  switch (instr_opc(instr, ctx->gpu_id)) {
-  case OPC_KILL:
-  case OPC_PREDT:
-  case OPC_PREDF:
-    fprintf(ctx->out, " %sp0.%c", cat0->inv0 ? "!" : "",
-        component[cat0->comp0]);
-    break;
-  case OPC_B:
-    fprintf(ctx->out, "%s", brinfo[cat0->brtype].suffix);
-    if (brinfo[cat0->brtype].idx) {
-      fprintf(ctx->out, ".%u", cat0->idx);
-    }
-    if (brinfo[cat0->brtype].nsrc >= 1) {
-      fprintf(ctx->out, " %sp0.%c,", cat0->inv0 ? "!" : "",
-          component[cat0->comp0]);
-    }
-    if (brinfo[cat0->brtype].nsrc >= 2) {
-      fprintf(ctx->out, " %sp0.%c,", cat0->inv1 ? "!" : "",
-          component[cat0->comp1]);
-    }
-    fprintf(ctx->out, " #%d", cat0->a3xx.immed);
-    break;
-  case OPC_JUMP:
-  case OPC_CALL:
-  case OPC_BKT:
-  case OPC_GETONE:
-  case OPC_SHPS:
-    fprintf(ctx->out, " #%d", cat0->a3xx.immed);
-    break;
-  }
-
-  if ((debug & PRINT_VERBOSE) && (cat0->dummy3|cat0->dummy4))
-    fprintf(ctx->out, "\t{0: %x,%x}", cat0->dummy3, cat0->dummy4);
-}
-
-static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr)
-{
-  instr_cat1_t *cat1 = &instr->cat1;
-
-  if (cat1->ul)
-    fprintf(ctx->out, "(ul)");
-
-  if (cat1->src_type == cat1->dst_type) {
-    if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
-      /* special case (nmemonic?): */
-      fprintf(ctx->out, "mova");
-    } else {
-      fprintf(ctx->out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
-    }
-  } else {
-    fprintf(ctx->out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
-  }
-
-  fprintf(ctx->out, " ");
-
-  if (cat1->even)
-    fprintf(ctx->out, "(even)");
-
-  if (cat1->pos_inf)
-    fprintf(ctx->out, "(pos_infinity)");
-
-  print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
-      cat1->dst_rel);
-
-  fprintf(ctx->out, ", ");
-
-  /* ugg, have to special case this.. vs print_reg().. */
-  if (cat1->src_im) {
-    if (type_float(cat1->src_type))
-      fprintf(ctx->out, "(%f)", cat1->fim_val);
-    else if (type_uint(cat1->src_type))
-      fprintf(ctx->out, "0x%08x", cat1->uim_val);
-    else
-      fprintf(ctx->out, "%d", cat1->iim_val);
-  } else if (cat1->src_rel && !cat1->src_c) {
-    /* I would just use %+d but trying to make it diff'able with
-     * libllvm-a3xx...
-     */
-    char type = cat1->src_rel_c ? 'c' : 'r';
-    const char *full = (type_size(cat1->src_type) == 32) ? "" : "h";
-    if (cat1->off < 0)
-      fprintf(ctx->out, "%s%c<a0.x - %d>", full, type, -cat1->off);
-    else if (cat1->off > 0)
-      fprintf(ctx->out, "%s%c<a0.x + %d>", full, type, cat1->off);
-    else
-      fprintf(ctx->out, "%s%c<a0.x>", full, type);
-  } else {
-    struct reginfo src = {
-      .reg = (reg_t)cat1->src,
-      .full = type_size(cat1->src_type) == 32,
-      .r = cat1->src_r,
-      .c = cat1->src_c,
-      .im = cat1->src_im,
-    };
-    print_src(ctx, &src);
-  }
-
-  if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
-    fprintf(ctx->out, "\t{1: %x}", cat1->must_be_0);
-}
-
-static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr)
-{
-  instr_cat2_t *cat2 = &instr->cat2;
-  int opc = _OPC(2, cat2->opc);
-  static const char *cond[] = {
-      "lt",
-      "le",
-      "gt",
-      "ge",
-      "eq",
-      "ne",
-      "?6?",
-  };
-
-  switch (opc) {
-  case OPC_CMPS_F:
-  case OPC_CMPS_U:
-  case OPC_CMPS_S:
-  case OPC_CMPV_F:
-  case OPC_CMPV_U:
-  case OPC_CMPV_S:
-    fprintf(ctx->out, ".%s", cond[cat2->cond]);
-    break;
-  }
-
-  fprintf(ctx->out, " ");
-  if (cat2->ei)
-    fprintf(ctx->out, "(ei)");
-  print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
-  fprintf(ctx->out, ", ");
-
-  struct reginfo src1 = {
-    .full = cat2->full,
-    .r = cat2->repeat ? cat2->src1_r : 0,
-    .f = is_cat2_float(opc),
-    .im = cat2->src1_im,
-    .abs = cat2->src1_abs,
-    .neg = cat2->src1_neg,
-  };
-
-  if (cat2->c1.src1_c) {
-    src1.reg = (reg_t)(cat2->c1.src1);
-    src1.c = true;
-  } else if (cat2->rel1.src1_rel) {
-    src1.reg = (reg_t)(cat2->rel1.src1);
-    src1.c = cat2->rel1.src1_c;
-    src1.addr_rel = true;
-  } else {
-    src1.reg = (reg_t)(cat2->src1);
-  }
-  print_src(ctx, &src1);
-
-  struct reginfo src2 = {
-    .r = cat2->repeat ? cat2->src2_r : 0,
-    .full = cat2->full,
-    .f = is_cat2_float(opc),
-    .abs = cat2->src2_abs,
-    .neg = cat2->src2_neg,
-    .im = cat2->src2_im,
-  };
-  switch (opc) {
-  case OPC_ABSNEG_F:
-  case OPC_ABSNEG_S:
-  case OPC_CLZ_B:
-  case OPC_CLZ_S:
-  case OPC_SIGN_F:
-  case OPC_FLOOR_F:
-  case OPC_CEIL_F:
-  case OPC_RNDNE_F:
-  case OPC_RNDAZ_F:
-  case OPC_TRUNC_F:
-  case OPC_NOT_B:
-  case OPC_BFREV_B:
-  case OPC_SETRM:
-  case OPC_CBITS_B:
-    /* these only have one src reg */
-    break;
-  default:
-    fprintf(ctx->out, ", ");
-    if (cat2->c2.src2_c) {
-      src2.reg = (reg_t)(cat2->c2.src2);
-      src2.c = true;
-    } else if (cat2->rel2.src2_rel) {
-      src2.reg = (reg_t)(cat2->rel2.src2);
-      src2.c = cat2->rel2.src2_c;
-      src2.addr_rel = true;
-    } else {
-      src2.reg = (reg_t)(cat2->src2);
-    }
-    print_src(ctx, &src2);
-    break;
-  }
-}
-
-static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr)
-{
-  instr_cat3_t *cat3 = &instr->cat3;
-  bool full = instr_cat3_full(cat3);
-
-  fprintf(ctx->out, " ");
-  print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false);
-  fprintf(ctx->out, ", ");
-
-  struct reginfo src1 = {
-    .r = cat3->repeat ? cat3->src1_r : 0,
-    .full = full,
-    .neg = cat3->src1_neg,
-  };
-  if (cat3->c1.src1_c) {
-    src1.reg = (reg_t)(cat3->c1.src1);
-    src1.c = true;
-  } else if (cat3->rel1.src1_rel) {
-    src1.reg = (reg_t)(cat3->rel1.src1);
-    src1.c = cat3->rel1.src1_c;
-    src1.addr_rel = true;
-  } else {
-    src1.reg = (reg_t)(cat3->src1);
-  }
-  print_src(ctx, &src1);
-
-  fprintf(ctx->out, ", ");
-  struct reginfo src2 = {
-    .reg = (reg_t)cat3->src2,
-    .full = full,
-    .r = cat3->repeat ? cat3->src2_r : 0,
-    .c = cat3->src2_c,
-    .neg = cat3->src2_neg,
-  };
-  print_src(ctx, &src2);
-
-  fprintf(ctx->out, ", ");
-  struct reginfo src3 = {
-    .r = cat3->src3_r,
-    .full = full,
-    .neg = cat3->src3_neg,
-  };
-  if (cat3->c2.src3_c) {
-    src3.reg = (reg_t)(cat3->c2.src3);
-    src3.c = true;
-  } else if (cat3->rel2.src3_rel) {
-    src3.reg = (reg_t)(cat3->rel2.src3);
-    src3.c = cat3->rel2.src3_c;
-    src3.addr_rel = true;
-  } else {
-    src3.reg = (reg_t)(cat3->src3);
-  }
-  print_src(ctx, &src3);
-}
-
-static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr)
-{
-  instr_cat4_t *cat4 = &instr->cat4;
-
-  fprintf(ctx->out, " ");
-  print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
-  fprintf(ctx->out, ", ");
-
-  struct reginfo src = {
-    .r = cat4->src_r,
-    .im = cat4->src_im,
-    .full = cat4->full,
-    .neg = cat4->src_neg,
-    .abs = cat4->src_abs,
-  };
-  if (cat4->c.src_c) {
-    src.reg = (reg_t)(cat4->c.src);
-    src.c = true;
-  } else if (cat4->rel.src_rel) {
-    src.reg = (reg_t)(cat4->rel.src);
-    src.c = cat4->rel.src_c;
-    src.addr_rel = true;
-  } else {
-    src.reg = (reg_t)(cat4->src);
-  }
-  print_src(ctx, &src);
-
-  if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2))
-    fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
-}
-
-static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr)
-{
-  static const struct {
-    bool src1, src2, samp, tex;
-  } info[0x1f] = {
-      [opc_op(OPC_ISAM)]     = { true,  false, true,  true,  },
-      [opc_op(OPC_ISAML)]    = { true,  true,  true,  true,  },
-      [opc_op(OPC_ISAMM)]    = { true,  false, true,  true,  },
-      [opc_op(OPC_SAM)]      = { true,  false, true,  true,  },
-      [opc_op(OPC_SAMB)]     = { true,  true,  true,  true,  },
-      [opc_op(OPC_SAML)]     = { true,  true,  true,  true,  },
-      [opc_op(OPC_SAMGQ)]    = { true,  false, true,  true,  },
-      [opc_op(OPC_GETLOD)]   = { true,  false, true,  true,  },
-      [opc_op(OPC_CONV)]     = { true,  true,  true,  true,  },
-      [opc_op(OPC_CONVM)]    = { true,  true,  true,  true,  },
-      [opc_op(OPC_GETSIZE)]  = { true,  false, false, true,  },
-      [opc_op(OPC_GETBUF)]   = { false, false, false, true,  },
-      [opc_op(OPC_GETPOS)]   = { true,  false, false, true,  },
-      [opc_op(OPC_GETINFO)]  = { false, false, false, true,  },
-      [opc_op(OPC_DSX)]      = { true,  false, false, false, },
-      [opc_op(OPC_DSY)]      = { true,  false, false, false, },
-      [opc_op(OPC_GATHER4R)] = { true,  false, true,  true,  },
-      [opc_op(OPC_GATHER4G)] = { true,  false, true,  true,  },
-      [opc_op(OPC_GATHER4B)] = { true,  false, true,  true,  },
-      [opc_op(OPC_GATHER4A)] = { true,  false, true,  true,  },
-      [opc_op(OPC_SAMGP0)]   = { true,  false, true,  true,  },
-      [opc_op(OPC_SAMGP1)]   = { true,  false, true,  true,  },
-      [opc_op(OPC_SAMGP2)]   = { true,  false, true,  true,  },
-      [opc_op(OPC_SAMGP3)]   = { true,  false, true,  true,  },
-      [opc_op(OPC_DSXPP_1)]  = { true,  false, false, false, },
-      [opc_op(OPC_DSYPP_1)]  = { true,  false, false, false, },
-      [opc_op(OPC_RGETPOS)]  = { true,  false, false, false, },
-      [opc_op(OPC_RGETINFO)] = { false, false, false, false, },
-  };
-
-  static const struct {
-    bool indirect;
-    bool bindless;
-    bool use_a1;
-    bool uniform;
-  } desc_features[8] = {
-    [CAT5_NONUNIFORM] = { .indirect = true, },
-    [CAT5_UNIFORM] = { .indirect = true, .uniform = true, },
-    [CAT5_BINDLESS_IMM] = { .bindless = true, },
-    [CAT5_BINDLESS_UNIFORM] = {
-      .bindless = true,
-      .indirect = true,
-      .uniform = true,
-    },
-    [CAT5_BINDLESS_NONUNIFORM] = {
-      .bindless = true,
-      .indirect = true,
-    },
-    [CAT5_BINDLESS_A1_IMM] = {
-      .bindless = true,
-      .use_a1 = true,
-    },
-    [CAT5_BINDLESS_A1_UNIFORM] = {
-      .bindless = true,
-      .indirect = true,
-      .uniform = true,
-      .use_a1 = true,
-    },
-    [CAT5_BINDLESS_A1_NONUNIFORM] = {
-      .bindless = true,
-      .indirect = true,
-      .use_a1 = true,
-    },
-  };
-
-  instr_cat5_t *cat5 = &instr->cat5;
-  int i;
-
-  bool desc_indirect =
-    cat5->is_s2en_bindless &&
-    desc_features[cat5->s2en_bindless.desc_mode].indirect;
-  bool bindless =
-    cat5->is_s2en_bindless &&
-    desc_features[cat5->s2en_bindless.desc_mode].bindless;
-  bool use_a1 =
-    cat5->is_s2en_bindless &&
-    desc_features[cat5->s2en_bindless.desc_mode].use_a1;
-  bool uniform =
-    cat5->is_s2en_bindless &&
-    desc_features[cat5->s2en_bindless.desc_mode].uniform;
-
-  if (cat5->is_3d)   fprintf(ctx->out, ".3d");
-  if (cat5->is_a)    fprintf(ctx->out, ".a");
-  if (cat5->is_o)    fprintf(ctx->out, ".o");
-  if (cat5->is_p)    fprintf(ctx->out, ".p");
-  if (cat5->is_s)    fprintf(ctx->out, ".s");
-  if (desc_indirect) fprintf(ctx->out, ".s2en");
-  if (uniform)       fprintf(ctx->out, ".uniform");
-
-  if (bindless) {
-    unsigned base = (cat5->s2en_bindless.base_hi << 1) | cat5->base_lo;
-    fprintf(ctx->out, ".base%d", base);
-  }
-
-  fprintf(ctx->out, " ");
-
-  switch (_OPC(5, cat5->opc)) {
-  case OPC_DSXPP_1:
-  case OPC_DSYPP_1:
-    break;
-  default:
-    fprintf(ctx->out, "(%s)", type[cat5->type]);
-    break;
-  }
-
-  fprintf(ctx->out, "(");
-  for (i = 0; i < 4; i++)
-    if (cat5->wrmask & (1 << i))
-      fprintf(ctx->out, "%c", "xyzw"[i]);
-  fprintf(ctx->out, ")");
-
-  print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
-
-  if (info[cat5->opc].src1) {
-    fprintf(ctx->out, ", ");
-    struct reginfo src = { .reg = (reg_t)(cat5->src1), .full = cat5->full };
-    print_src(ctx, &src);
-  }
-
-  if (cat5->is_o || info[cat5->opc].src2) {
-    fprintf(ctx->out, ", ");
-    struct reginfo src = { .reg = (reg_t)(cat5->src2), .full = cat5->full };
-    print_src(ctx, &src);
-  }
-  if (cat5->is_s2en_bindless) {
-    if (!desc_indirect) {
-      if (info[cat5->opc].samp) {
-        if (use_a1)
-          fprintf(ctx->out, ", s#%d", cat5->s2en_bindless.src3);
-        else
-          fprintf(ctx->out, ", s#%d", cat5->s2en_bindless.src3 & 0xf);
-      }
-
-      if (info[cat5->opc].tex && !use_a1) {
-        fprintf(ctx->out, ", t#%d", cat5->s2en_bindless.src3 >> 4);
-      }
-    }
-  } else {
-    if (info[cat5->opc].samp)
-      fprintf(ctx->out, ", s#%d", cat5->norm.samp);
-    if (info[cat5->opc].tex)
-      fprintf(ctx->out, ", t#%d", cat5->norm.tex);
-  }
-
-  if (desc_indirect) {
-    fprintf(ctx->out, ", ");
-    struct reginfo src = { .reg = (reg_t)(cat5->s2en_bindless.src3), .full = bindless };
-    print_src(ctx, &src);
-  }
-
-  if (use_a1)
-    fprintf(ctx->out, ", a1.x");
-
-  if (debug & PRINT_VERBOSE) {
-    if (cat5->is_s2en_bindless) {
-      if ((debug & PRINT_VERBOSE) && cat5->s2en_bindless.dummy1)
-        fprintf(ctx->out, "\t{5: %x}", cat5->s2en_bindless.dummy1);
-    } else {
-      if ((debug & PRINT_VERBOSE) && cat5->norm.dummy1)
-        fprintf(ctx->out, "\t{5: %x}", cat5->norm.dummy1);
-    }
-  }
-}
-
-static void print_instr_cat6_a3xx(struct disasm_ctx *ctx, instr_t *instr)
-{
-  instr_cat6_t *cat6 = &instr->cat6;
-  char sd = 0, ss = 0;  /* dst/src address space */
-  bool nodst = false;
-  struct reginfo dst, src1, src2;
-  int src1off = 0, dstoff = 0;
-
-  memset(&dst, 0, sizeof(dst));
-  memset(&src1, 0, sizeof(src1));
-  memset(&src2, 0, sizeof(src2));
-
-  switch (_OPC(6, cat6->opc)) {
-  case OPC_RESINFO:
-  case OPC_RESFMT:
-    dst.full  = type_size(cat6->type) == 32;
-    src1.full = type_size(cat6->type) == 32;
-    src2.full = type_size(cat6->type) == 32;
-    break;
-  case OPC_L2G:
-  case OPC_G2L:
-    dst.full = true;
-    src1.full = true;
-    src2.full = true;
-    break;
-  case OPC_STG:
-  case OPC_STL:
-  case OPC_STP:
-  case OPC_STLW:
-  case OPC_STIB:
-    dst.full  = type_size(cat6->type) == 32;
-    src1.full = type_size(cat6->type) == 32;
-    src2.full = type_size(cat6->type) == 32;
-    break;
-  default:
-    dst.full  = type_size(cat6->type) == 32;
-    src1.full = true;
-    src2.full = true;
-    break;
-  }
-
-  switch (_OPC(6, cat6->opc)) {
-  case OPC_PREFETCH:
-    break;
-  case OPC_RESINFO:
-    fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
-    break;
-  case OPC_LDGB:
-    fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
-    fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
-    fprintf(ctx->out, ".%s", type[cat6->type]);
-    fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
-    break;
-  case OPC_STGB:
-  case OPC_STIB:
-    fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped");
-    fprintf(ctx->out, ".%dd", cat6->stgb.d + 1);
-    fprintf(ctx->out, ".%s", type[cat6->type]);
-    fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1);
-    break;
-  case OPC_ATOMIC_ADD:
-  case OPC_ATOMIC_SUB:
-  case OPC_ATOMIC_XCHG:
-  case OPC_ATOMIC_INC:
-  case OPC_ATOMIC_DEC:
-  case OPC_ATOMIC_CMPXCHG:
-  case OPC_ATOMIC_MIN:
-  case OPC_ATOMIC_MAX:
-  case OPC_ATOMIC_AND:
-  case OPC_ATOMIC_OR:
-  case OPC_ATOMIC_XOR:
-    ss = cat6->g ? 'g' : 'l';
-    fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
-    fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
-    fprintf(ctx->out, ".%s", type[cat6->type]);
-    fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
-    fprintf(ctx->out, ".%c", ss);
-    break;
-  default:
-    dst.im = cat6->g && !cat6->dst_off;
-    fprintf(ctx->out, ".%s", type[cat6->type]);
-    break;
-  }
-  fprintf(ctx->out, " ");
-
-  switch (_OPC(6, cat6->opc)) {
-  case OPC_STG:
-    sd = 'g';
-    break;
-  case OPC_STP:
-    sd = 'p';
-    break;
-  case OPC_STL:
-  case OPC_STLW:
-    sd = 'l';
-    break;
-
-  case OPC_LDG:
-  case OPC_LDC:
-    ss = 'g';
-    break;
-  case OPC_LDP:
-    ss = 'p';
-    break;
-  case OPC_LDL:
-  case OPC_LDLW:
-  case OPC_LDLV:
-    ss = 'l';
-    break;
-
-  case OPC_L2G:
-    ss = 'l';
-    sd = 'g';
-    break;
-
-  case OPC_G2L:
-    ss = 'g';
-    sd = 'l';
-    break;
-
-  case OPC_PREFETCH:
-    ss = 'g';
-    nodst = true;
-    break;
-  }
-
-  if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) {
-    struct reginfo src3;
-
-    memset(&src3, 0, sizeof(src3));
-
-    src1.reg = (reg_t)(cat6->stgb.src1);
-    src2.reg = (reg_t)(cat6->stgb.src2);
-    src2.im  = cat6->stgb.src2_im;
-    src3.reg = (reg_t)(cat6->stgb.src3);
-    src3.im  = cat6->stgb.src3_im;
-    src3.full = true;
-
-    fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo);
-    print_src(ctx, &src1);
-    fprintf(ctx->out, ", ");
-    print_src(ctx, &src2);
-    fprintf(ctx->out, ", ");
-    print_src(ctx, &src3);
-
-    if (debug & PRINT_VERBOSE)
-      fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
-
-    return;
-  }
-
-  if (is_atomic(_OPC(6, cat6->opc))) {
-
-    src1.reg = (reg_t)(cat6->ldgb.src1);
-    src1.im  = cat6->ldgb.src1_im;
-    src2.reg = (reg_t)(cat6->ldgb.src2);
-    src2.im  = cat6->ldgb.src2_im;
-    dst.reg  = (reg_t)(cat6->ldgb.dst);
-
-    print_src(ctx, &dst);
-    fprintf(ctx->out, ", ");
-    if (ss == 'g') {
-      struct reginfo src3;
-      memset(&src3, 0, sizeof(src3));
-
-      src3.reg = (reg_t)(cat6->ldgb.src3);
-      src3.full = true;
-
-      /* For images, the ".typed" variant is used and src2 is
-       * the ivecN coordinates, ie ivec2 for 2d.
-       *
-       * For SSBOs, the ".untyped" variant is used and src2 is
-       * a simple dword offset..  src3 appears to be
-       * uvec2(offset * 4, 0).  Not sure the point of that.
-       */
-
-      fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
-      print_src(ctx, &src1);  /* value */
-      fprintf(ctx->out, ", ");
-      print_src(ctx, &src2);  /* offset/coords */
-      fprintf(ctx->out, ", ");
-      print_src(ctx, &src3);  /* 64b byte offset.. */
-
-      if (debug & PRINT_VERBOSE) {
-        fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
-            cat6->ldgb.pad3, cat6->ldgb.mustbe0);
-      }
-    } else { /* ss == 'l' */
-      fprintf(ctx->out, "l[");
-      print_src(ctx, &src1);  /* simple byte offset */
-      fprintf(ctx->out, "], ");
-      print_src(ctx, &src2);  /* value */
-
-      if (debug & PRINT_VERBOSE) {
-        fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
-            cat6->ldgb.src3, cat6->ldgb.pad0,
-            cat6->ldgb.pad3, cat6->ldgb.mustbe0);
-      }
-    }
-
-    return;
-  } else if (_OPC(6, cat6->opc) == OPC_RESINFO) {
-    dst.reg  = (reg_t)(cat6->ldgb.dst);
-
-    print_src(ctx, &dst);
-    fprintf(ctx->out, ", ");
-    fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo);
-
-    return;
-  } else if (_OPC(6, cat6->opc) == OPC_LDGB) {
-
-    src1.reg = (reg_t)(cat6->ldgb.src1);
-    src1.im  = cat6->ldgb.src1_im;
-    src2.reg = (reg_t)(cat6->ldgb.src2);
-    src2.im  = cat6->ldgb.src2_im;
-    dst.reg  = (reg_t)(cat6->ldgb.dst);
-
-    print_src(ctx, &dst);
-    fprintf(ctx->out, ", ");
-    fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
-    print_src(ctx, &src1);
-    fprintf(ctx->out, ", ");
-    print_src(ctx, &src2);
-
-    if (debug & PRINT_VERBOSE)
-      fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
-
-    return;
-  } else if (_OPC(6, cat6->opc) == OPC_LDG && cat6->a.src1_im && cat6->a.src2_im) {
-    struct reginfo src3;
-
-    memset(&src3, 0, sizeof(src3));
-    src1.reg = (reg_t)(cat6->a.src1);
-    src2.reg = (reg_t)(cat6->a.src2);
-    src2.im  = cat6->a.src2_im;
-    src3.reg = (reg_t)(cat6->a.off);
-    src3.full = true;
-    dst.reg  = (reg_t)(cat6->d.dst);
-
-    print_src(ctx, &dst);
-    fprintf(ctx->out, ", g[");
-    print_src(ctx, &src1);
-    fprintf(ctx->out, "+");
-    print_src(ctx, &src3);
-    fprintf(ctx->out, "], ");
-    print_src(ctx, &src2);
-
-    return;
-  }
-  if (cat6->dst_off) {
-    dst.reg = (reg_t)(cat6->c.dst);
-    dstoff  = cat6->c.off;
-  } else {
-    dst.reg = (reg_t)(cat6->d.dst);
-  }
-
-  if (cat6->src_off) {
-    src1.reg = (reg_t)(cat6->a.src1);
-    src1.im  = cat6->a.src1_im;
-    src2.reg = (reg_t)(cat6->a.src2);
-    src2.im  = cat6->a.src2_im;
-    src1off  = cat6->a.off;
-  } else {
-    src1.reg = (reg_t)(cat6->b.src1);
-    src1.im  = cat6->b.src1_im;
-    src2.reg = (reg_t)(cat6->b.src2);
-    src2.im  = cat6->b.src2_im;
-  }
-
-  if (!nodst) {
-    if (sd)
-      fprintf(ctx->out, "%c[", sd);
-    /* note: dst might actually be a src (ie. address to store to) */
-    print_src(ctx, &dst);
-    if (cat6->dst_off && cat6->g) {
-      struct reginfo dstoff_reg = {0};
-      dstoff_reg.reg = (reg_t) cat6->c.off;
-      dstoff_reg.full  = true;
-      fprintf(ctx->out, "+");
-      print_src(ctx, &dstoff_reg);
-    } else if (dstoff)
-      fprintf(ctx->out, "%+d", dstoff);
-    if (sd)
-      fprintf(ctx->out, "]");
-    fprintf(ctx->out, ", ");
-  }
-
-  if (ss)
-    fprintf(ctx->out, "%c[", ss);
-
-  /* can have a larger than normal immed, so hack: */
-  if (src1.im) {
-    fprintf(ctx->out, "%u", src1.reg.dummy13);
-  } else {
-    print_src(ctx, &src1);
-  }
-
-  if (cat6->src_off && cat6->g)
-    print_src(ctx, &src2);
-  else if (src1off)
-    fprintf(ctx->out, "%+d", src1off);
-  if (ss)
-    fprintf(ctx->out, "]");
-
-  switch (_OPC(6, cat6->opc)) {
-  case OPC_RESINFO:
-  case OPC_RESFMT:
-    break;
-  default:
-    fprintf(ctx->out, ", ");
-    print_src(ctx, &src2);
-    break;
-  }
-}
-
-static void print_instr_cat6_a6xx(struct disasm_ctx *ctx, instr_t *instr)
-{
-  instr_cat6_a6xx_t *cat6 = &instr->cat6_a6xx;
-  struct reginfo src1, src2, ssbo;
-  bool uses_type = _OPC(6, cat6->opc) != OPC_LDC;
-
-  static const struct {
-    bool indirect;
-    bool bindless;
-    const char *name;
-  } desc_features[8] = {
-    [CAT6_IMM] = {
-      .name = "imm"
-    },
-    [CAT6_UNIFORM] = {
-      .indirect = true,
-      .name = "uniform"
-    },
-    [CAT6_NONUNIFORM] = {
-      .indirect = true,
-      .name = "nonuniform"
-    },
-    [CAT6_BINDLESS_IMM] = {
-      .bindless = true,
-      .name = "imm"
-    },
-    [CAT6_BINDLESS_UNIFORM] = {
-      .bindless = true,
-      .indirect = true,
-      .name = "uniform"
-    },
-    [CAT6_BINDLESS_NONUNIFORM] = {
-      .bindless = true,
-      .indirect = true,
-      .name = "nonuniform"
-    },
-  };
-
-  bool indirect_ssbo = desc_features[cat6->desc_mode].indirect;
-  bool bindless = desc_features[cat6->desc_mode].bindless;
-  bool type_full = cat6->type != TYPE_U16;
-
-
-  memset(&src1, 0, sizeof(src1));
-  memset(&src2, 0, sizeof(src2));
-  memset(&ssbo, 0, sizeof(ssbo));
-
-  if (uses_type) {
-    fprintf(ctx->out, ".%s", cat6->typed ? "typed" : "untyped");
-    fprintf(ctx->out, ".%dd", cat6->d + 1);
-    fprintf(ctx->out, ".%s", type[cat6->type]);
-  } else {
-    fprintf(ctx->out, ".offset%d", cat6->d);
-  }
-  fprintf(ctx->out, ".%u", cat6->type_size + 1);
-
-  fprintf(ctx->out, ".%s", desc_features[cat6->desc_mode].name);
-  if (bindless)
-    fprintf(ctx->out, ".base%d", cat6->base);
-  fprintf(ctx->out, " ");
-
-  src2.reg = (reg_t)(cat6->src2);
-  src2.full = type_full;
-  print_src(ctx, &src2);
-  fprintf(ctx->out, ", ");
-
-  src1.reg = (reg_t)(cat6->src1);
-  src1.full = true; // XXX
-  print_src(ctx, &src1);
-  fprintf(ctx->out, ", ");
-  ssbo.reg = (reg_t)(cat6->ssbo);
-  ssbo.im = !indirect_ssbo;
-  ssbo.full = true;
-  print_src(ctx, &ssbo);
-
-  if (debug & PRINT_VERBOSE) {
-    fprintf(ctx->out, " (pad1=%x, pad2=%x, pad3=%x, pad4=%x, pad5=%x)",
-        cat6->pad1, cat6->pad2, cat6->pad3, cat6->pad4, cat6->pad5);
-  }
-}
-
-static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr)
-{
-  if (!is_cat6_legacy(instr, ctx->gpu_id)) {
-    print_instr_cat6_a6xx(ctx, instr);
-    if (debug & PRINT_VERBOSE)
-      fprintf(ctx->out, " NEW");
-  } else {
-    print_instr_cat6_a3xx(ctx, instr);
-    if (debug & PRINT_VERBOSE)
-      fprintf(ctx->out, " LEGACY");
-  }
-}
-static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr)
-{
-  instr_cat7_t *cat7 = &instr->cat7;
-
-  if (cat7->g)
-    fprintf(ctx->out, ".g");
-  if (cat7->l)
-    fprintf(ctx->out, ".l");
-
-  if (_OPC(7, cat7->opc) == OPC_FENCE) {
-    if (cat7->r)
-      fprintf(ctx->out, ".r");
-    if (cat7->w)
-      fprintf(ctx->out, ".w");
-  }
-}
-
-/* size of largest OPC field of all the instruction categories: */
-#define NOPC_BITS 6
-
-static const struct opc_info {
-  uint16_t cat;
-  uint16_t opc;
-  const char *name;
-  void (*print)(struct disasm_ctx *ctx, instr_t *instr);
-} opcs[1 << (3+NOPC_BITS)] = {
-#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
-  /* category 0: */
-  OPC(0, OPC_NOP,          nop),
-  OPC(0, OPC_B,            b),
-  OPC(0, OPC_JUMP,         jump),
-  OPC(0, OPC_CALL,         call),
-  OPC(0, OPC_RET,          ret),
-  OPC(0, OPC_KILL,         kill),
-  OPC(0, OPC_END,          end),
-  OPC(0, OPC_EMIT,         emit),
-  OPC(0, OPC_CUT,          cut),
-  OPC(0, OPC_CHMASK,       chmask),
-  OPC(0, OPC_CHSH,         chsh),
-  OPC(0, OPC_FLOW_REV,     flow_rev),
-  OPC(0, OPC_PREDT,        predt),
-  OPC(0, OPC_PREDF,        predf),
-  OPC(0, OPC_PREDE,        prede),
-  OPC(0, OPC_BKT,          bkt),
-  OPC(0, OPC_STKS,         stks),
-  OPC(0, OPC_STKR,         stkr),
-  OPC(0, OPC_XSET,         xset),
-  OPC(0, OPC_XCLR,         xclr),
-  OPC(0, OPC_GETONE,       getone),
-  OPC(0, OPC_DBG,          dbg),
-  OPC(0, OPC_SHPS,         shps),
-  OPC(0, OPC_SHPE,         shpe),
-
-  /* category 1: */
-  OPC(1, OPC_MOV, ),
-
-  /* category 2: */
-  OPC(2, OPC_ADD_F,        add.f),
-  OPC(2, OPC_MIN_F,        min.f),
-  OPC(2, OPC_MAX_F,        max.f),
-  OPC(2, OPC_MUL_F,        mul.f),
-  OPC(2, OPC_SIGN_F,       sign.f),
-  OPC(2, OPC_CMPS_F,       cmps.f),
-  OPC(2, OPC_ABSNEG_F,     absneg.f),
-  OPC(2, OPC_CMPV_F,       cmpv.f),
-  OPC(2, OPC_FLOOR_F,      floor.f),
-  OPC(2, OPC_CEIL_F,       ceil.f),
-  OPC(2, OPC_RNDNE_F,      rndne.f),
-  OPC(2, OPC_RNDAZ_F,      rndaz.f),
-  OPC(2, OPC_TRUNC_F,      trunc.f),
-  OPC(2, OPC_ADD_U,        add.u),
-  OPC(2, OPC_ADD_S,        add.s),
-  OPC(2, OPC_SUB_U,        sub.u),
-  OPC(2, OPC_SUB_S,        sub.s),
-  OPC(2, OPC_CMPS_U,       cmps.u),
-  OPC(2, OPC_CMPS_S,       cmps.s),
-  OPC(2, OPC_MIN_U,        min.u),
-  OPC(2, OPC_MIN_S,        min.s),
-  OPC(2, OPC_MAX_U,        max.u),
-  OPC(2, OPC_MAX_S,        max.s),
-  OPC(2, OPC_ABSNEG_S,     absneg.s),
-  OPC(2, OPC_AND_B,        and.b),
-  OPC(2, OPC_OR_B,         or.b),
-  OPC(2, OPC_NOT_B,        not.b),
-  OPC(2, OPC_XOR_B,        xor.b),
-  OPC(2, OPC_CMPV_U,       cmpv.u),
-  OPC(2, OPC_CMPV_S,       cmpv.s),
-  OPC(2, OPC_MUL_U24,      mul.u24),
-  OPC(2, OPC_MUL_S24,      mul.s24),
-  OPC(2, OPC_MULL_U,       mull.u),
-  OPC(2, OPC_BFREV_B,      bfrev.b),
-  OPC(2, OPC_CLZ_S,        clz.s),
-  OPC(2, OPC_CLZ_B,        clz.b),
-  OPC(2, OPC_SHL_B,        shl.b),
-  OPC(2, OPC_SHR_B,        shr.b),
-  OPC(2, OPC_ASHR_B,       ashr.b),
-  OPC(2, OPC_BARY_F,       bary.f),
-  OPC(2, OPC_MGEN_B,       mgen.b),
-  OPC(2, OPC_GETBIT_B,     getbit.b),
-  OPC(2, OPC_SETRM,        setrm),
-  OPC(2, OPC_CBITS_B,      cbits.b),
-  OPC(2, OPC_SHB,          shb),
-  OPC(2, OPC_MSAD,         msad),
-
-  /* category 3: */
-  OPC(3, OPC_MAD_U16,      mad.u16),
-  OPC(3, OPC_MADSH_U16,    madsh.u16),
-  OPC(3, OPC_MAD_S16,      mad.s16),
-  OPC(3, OPC_MADSH_M16,    madsh.m16),
-  OPC(3, OPC_MAD_U24,      mad.u24),
-  OPC(3, OPC_MAD_S24,      mad.s24),
-  OPC(3, OPC_MAD_F16,      mad.f16),
-  OPC(3, OPC_MAD_F32,      mad.f32),
-  OPC(3, OPC_SEL_B16,      sel.b16),
-  OPC(3, OPC_SEL_B32,      sel.b32),
-  OPC(3, OPC_SEL_S16,      sel.s16),
-  OPC(3, OPC_SEL_S32,      sel.s32),
-  OPC(3, OPC_SEL_F16,      sel.f16),
-  OPC(3, OPC_SEL_F32,      sel.f32),
-  OPC(3, OPC_SAD_S16,      sad.s16),
-  OPC(3, OPC_SAD_S32,      sad.s32),
-
-  /* category 4: */
-  OPC(4, OPC_RCP,          rcp),
-  OPC(4, OPC_RSQ,          rsq),
-  OPC(4, OPC_LOG2,         log2),
-  OPC(4, OPC_EXP2,         exp2),
-  OPC(4, OPC_SIN,          sin),
-  OPC(4, OPC_COS,          cos),
-  OPC(4, OPC_SQRT,         sqrt),
-  OPC(4, OPC_HRSQ,         hrsq),
-  OPC(4, OPC_HLOG2,        hlog2),
-  OPC(4, OPC_HEXP2,        hexp2),
-
-  /* category 5: */
-  OPC(5, OPC_ISAM,         isam),
-  OPC(5, OPC_ISAML,        isaml),
-  OPC(5, OPC_ISAMM,        isamm),
-  OPC(5, OPC_SAM,          sam),
-  OPC(5, OPC_SAMB,         samb),
-  OPC(5, OPC_SAML,         saml),
-  OPC(5, OPC_SAMGQ,        samgq),
-  OPC(5, OPC_GETLOD,       getlod),
-  OPC(5, OPC_CONV,         conv),
-  OPC(5, OPC_CONVM,        convm),
-  OPC(5, OPC_GETSIZE,      getsize),
-  OPC(5, OPC_GETBUF,       getbuf),
-  OPC(5, OPC_GETPOS,       getpos),
-  OPC(5, OPC_GETINFO,      getinfo),
-  OPC(5, OPC_DSX,          dsx),
-  OPC(5, OPC_DSY,          dsy),
-  OPC(5, OPC_GATHER4R,     gather4r),
-  OPC(5, OPC_GATHER4G,     gather4g),
-  OPC(5, OPC_GATHER4B,     gather4b),
-  OPC(5, OPC_GATHER4A,     gather4a),
-  OPC(5, OPC_SAMGP0,       samgp0),
-  OPC(5, OPC_SAMGP1,       samgp1),
-  OPC(5, OPC_SAMGP2,       samgp2),
-  OPC(5, OPC_SAMGP3,       samgp3),
-  OPC(5, OPC_DSXPP_1,      dsxpp.1),
-  OPC(5, OPC_DSYPP_1,      dsypp.1),
-  OPC(5, OPC_RGETPOS,      rgetpos),
-  OPC(5, OPC_RGETINFO,     rgetinfo),
-
-
-  /* category 6: */
-  OPC(6, OPC_LDG,          ldg),
-  OPC(6, OPC_LDL,          ldl),
-  OPC(6, OPC_LDP,          ldp),
-  OPC(6, OPC_STG,          stg),
-  OPC(6, OPC_STL,          stl),
-  OPC(6, OPC_STP,          stp),
-  OPC(6, OPC_LDIB,         ldib),
-  OPC(6, OPC_G2L,          g2l),
-  OPC(6, OPC_L2G,          l2g),
-  OPC(6, OPC_PREFETCH,     prefetch),
-  OPC(6, OPC_LDLW,         ldlw),
-  OPC(6, OPC_STLW,         stlw),
-  OPC(6, OPC_RESFMT,       resfmt),
-  OPC(6, OPC_RESINFO,      resinfo),
-  OPC(6, OPC_ATOMIC_ADD,     atomic.add),
-  OPC(6, OPC_ATOMIC_SUB,     atomic.sub),
-  OPC(6, OPC_ATOMIC_XCHG,    atomic.xchg),
-  OPC(6, OPC_ATOMIC_INC,     atomic.inc),
-  OPC(6, OPC_ATOMIC_DEC,     atomic.dec),
-  OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
-  OPC(6, OPC_ATOMIC_MIN,     atomic.min),
-  OPC(6, OPC_ATOMIC_MAX,     atomic.max),
-  OPC(6, OPC_ATOMIC_AND,     atomic.and),
-  OPC(6, OPC_ATOMIC_OR,      atomic.or),
-  OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
-  OPC(6, OPC_LDGB,         ldgb),
-  OPC(6, OPC_STGB,         stgb),
-  OPC(6, OPC_STIB,         stib),
-  OPC(6, OPC_LDC,          ldc),
-  OPC(6, OPC_LDLV,         ldlv),
-
-  OPC(7, OPC_BAR,          bar),
-  OPC(7, OPC_FENCE,        fence),
-
-#undef OPC
-};
-
-#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr, ctx->gpu_id)]))
-
-// XXX hack.. probably should move this table somewhere common:
-#include "ir3.h"
-const char *ir3_instr_name(struct ir3_instruction *instr)
-{
-  if (opc_cat(instr->opc) == -1) return "??meta??";
-  return opcs[instr->opc].name;
-}
-
-static void print_single_instr(struct disasm_ctx *ctx, instr_t *instr)
-{
-  const char *name = GETINFO(instr)->name;
-  uint32_t opc = instr_opc(instr, ctx->gpu_id);
-
-  if (name) {
-    fprintf(ctx->out, "%s", name);
-    GETINFO(instr)->print(ctx, instr);
-  } else {
-    fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc);
-
-    switch (instr->opc_cat) {
-    case 0: print_instr_cat0(ctx, instr); break;
-    case 1: print_instr_cat1(ctx, instr); break;
-    case 2: print_instr_cat2(ctx, instr); break;
-    case 3: print_instr_cat3(ctx, instr); break;
-    case 4: print_instr_cat4(ctx, instr); break;
-    case 5: print_instr_cat5(ctx, instr); break;
-    case 6: print_instr_cat6(ctx, instr); break;
-    case 7: print_instr_cat7(ctx, instr); break;
-    }
-  }
-}
-
-static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n)
-{
-  instr_t *instr = (instr_t *)dwords;
-  uint32_t opc = instr_opc(instr, ctx->gpu_id);
-  unsigned nop = 0;
-  unsigned cycles = ctx->instructions;
-
-  if (debug & PRINT_VERBOSE) {
-    fprintf(ctx->out, "%s%04d:%04d[%08xx_%08xx] ", levels[ctx->level],
-        n, cycles++, dwords[1], dwords[0]);
-  }
-
-  /* NOTE: order flags are printed is a bit fugly.. but for now I
-   * try to match the order in llvm-a3xx disassembler for easy
-   * diff'ing..
-   */
-
-  ctx->repeat = instr_repeat(instr);
-  ctx->instructions += 1 + ctx->repeat;
-
-  if (instr->sync) {
-    fprintf(ctx->out, "(sy)");
-  }
-  if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7))) {
-    fprintf(ctx->out, "(ss)");
-  }
-  if (instr->jmp_tgt)
-    fprintf(ctx->out, "(jp)");
-  if ((instr->opc_cat == 0) && instr->cat0.eq)
-    fprintf(ctx->out, "(eq)");
-  if (instr_sat(instr))
-    fprintf(ctx->out, "(sat)");
-  if (ctx->repeat)
-    fprintf(ctx->out, "(rpt%d)", ctx->repeat);
-  else if ((instr->opc_cat == 2) && (instr->cat2.src1_r || instr->cat2.src2_r))
-    nop = (instr->cat2.src2_r * 2) + instr->cat2.src1_r;
-  else if ((instr->opc_cat == 3) && (instr->cat3.src1_r || instr->cat3.src2_r))
-    nop = (instr->cat3.src2_r * 2) + instr->cat3.src1_r;
-  ctx->instructions += nop;
-  if (nop)
-    fprintf(ctx->out, "(nop%d) ", nop);
-
-  if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
-    fprintf(ctx->out, "(ul)");
-
-  print_single_instr(ctx, instr);
-  fprintf(ctx->out, "\n");
-
-  if ((instr->opc_cat <= 4) && (debug & EXPAND_REPEAT)) {
-    int i;
-    for (i = 0; i < nop; i++) {
-      if (debug & PRINT_VERBOSE) {
-        fprintf(ctx->out, "%s%04d:%04d[                   ] ",
-            levels[ctx->level], n, cycles++);
-      }
-      fprintf(ctx->out, "nop\n");
-    }
-    for (i = 0; i < ctx->repeat; i++) {
-      ctx->repeatidx = i + 1;
-      if (debug & PRINT_VERBOSE) {
-        fprintf(ctx->out, "%s%04d:%04d[                   ] ",
-            levels[ctx->level], n, cycles++);
-      }
-      print_single_instr(ctx, instr);
-      fprintf(ctx->out, "\n");
-    }
-    ctx->repeatidx = 0;
-  }
-
-  return (instr->opc_cat == 0) && (opc == OPC_END);
-}
-
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id)
-{
-  struct disasm_ctx ctx;
-  int i;
-  int nop_count = 0;
-
-  //assert((sizedwords % 2) == 0);
-
-  memset(&ctx, 0, sizeof(ctx));
-  ctx.out = out;
-  ctx.level = level;
-  ctx.gpu_id = gpu_id;
-
-  for (i = 0; i < sizedwords; i += 2) {
-    print_instr(&ctx, &dwords[i], i/2);
-    if (dwords[i] == 0 && dwords[i + 1] == 0)
-      nop_count++;
-    else
-      nop_count = 0;
-    if (nop_count > 3)
-      break;
-  }
-
-  return 0;
-}
-
-// gcc -shared disasm-a3xx.c -o disasm.so
-void disasm(uint8_t* buf, int len) {
-  disasm_a3xx((uint32_t*)buf, len/4, 0, stdout, 630);
-}
-
-/*int main(int argc, char *argv[]) {
-  uint32_t buf[0x10000];
-  FILE *f = fopen(argv[1], "rb");
-  if (argc > 2) {
-    int seek = atoi(argv[2]);
-    printf("skip %d\n", seek);
-    fread(buf, 1, seek , f);
-  }
-  int len = fread(buf, 1, sizeof(buf), f);
-  fclose(f);
-
-  disasm_a3xx(buf, len/4, 0, stdout, 630);
-}*/
-
diff --git a/extra/disassemblers/adreno/instr-a3xx.h b/extra/disassemblers/adreno/instr-a3xx.h
deleted file mode 100644
index e4f548d639..0000000000
--- a/extra/disassemblers/adreno/instr-a3xx.h
+++ /dev/null
@@ -1,1119 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef INSTR_A3XX_H_
-#define INSTR_A3XX_H_
-
-#define PACKED __attribute__((__packed__))
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include <assert.h>
-
-/* size of largest OPC field of all the instruction categories: */
-#define NOPC_BITS 6
-
-#define _OPC(cat, opc)   (((cat) << NOPC_BITS) | opc)
-
-typedef enum {
-	/* category 0: */
-	OPC_NOP             = _OPC(0, 0),
-	OPC_B               = _OPC(0, 1),
-	OPC_JUMP            = _OPC(0, 2),
-	OPC_CALL            = _OPC(0, 3),
-	OPC_RET             = _OPC(0, 4),
-	OPC_KILL            = _OPC(0, 5),
-	OPC_END             = _OPC(0, 6),
-	OPC_EMIT            = _OPC(0, 7),
-	OPC_CUT             = _OPC(0, 8),
-	OPC_CHMASK          = _OPC(0, 9),
-	OPC_CHSH            = _OPC(0, 10),
-	OPC_FLOW_REV        = _OPC(0, 11),
-
-	OPC_BKT             = _OPC(0, 16),
-	OPC_STKS            = _OPC(0, 17),
-	OPC_STKR            = _OPC(0, 18),
-	OPC_XSET            = _OPC(0, 19),
-	OPC_XCLR            = _OPC(0, 20),
-	OPC_GETONE          = _OPC(0, 21),
-	OPC_DBG             = _OPC(0, 22),
-	OPC_SHPS            = _OPC(0, 23),   /* shader prologue start */
-	OPC_SHPE            = _OPC(0, 24),   /* shader prologue end */
-
-	OPC_PREDT           = _OPC(0, 29),   /* predicated true */
-	OPC_PREDF           = _OPC(0, 30),   /* predicated false */
-	OPC_PREDE           = _OPC(0, 31),   /* predicated end */
-
-	/* category 1: */
-	OPC_MOV             = _OPC(1, 0),
-
-	/* category 2: */
-	OPC_ADD_F           = _OPC(2, 0),
-	OPC_MIN_F           = _OPC(2, 1),
-	OPC_MAX_F           = _OPC(2, 2),
-	OPC_MUL_F           = _OPC(2, 3),
-	OPC_SIGN_F          = _OPC(2, 4),
-	OPC_CMPS_F          = _OPC(2, 5),
-	OPC_ABSNEG_F        = _OPC(2, 6),
-	OPC_CMPV_F          = _OPC(2, 7),
-	/* 8 - invalid */
-	OPC_FLOOR_F         = _OPC(2, 9),
-	OPC_CEIL_F          = _OPC(2, 10),
-	OPC_RNDNE_F         = _OPC(2, 11),
-	OPC_RNDAZ_F         = _OPC(2, 12),
-	OPC_TRUNC_F         = _OPC(2, 13),
-	/* 14-15 - invalid */
-	OPC_ADD_U           = _OPC(2, 16),
-	OPC_ADD_S           = _OPC(2, 17),
-	OPC_SUB_U           = _OPC(2, 18),
-	OPC_SUB_S           = _OPC(2, 19),
-	OPC_CMPS_U          = _OPC(2, 20),
-	OPC_CMPS_S          = _OPC(2, 21),
-	OPC_MIN_U           = _OPC(2, 22),
-	OPC_MIN_S           = _OPC(2, 23),
-	OPC_MAX_U           = _OPC(2, 24),
-	OPC_MAX_S           = _OPC(2, 25),
-	OPC_ABSNEG_S        = _OPC(2, 26),
-	/* 27 - invalid */
-	OPC_AND_B           = _OPC(2, 28),
-	OPC_OR_B            = _OPC(2, 29),
-	OPC_NOT_B           = _OPC(2, 30),
-	OPC_XOR_B           = _OPC(2, 31),
-	/* 32 - invalid */
-	OPC_CMPV_U          = _OPC(2, 33),
-	OPC_CMPV_S          = _OPC(2, 34),
-	/* 35-47 - invalid */
-	OPC_MUL_U24         = _OPC(2, 48), /* 24b mul into 32b result */
-	OPC_MUL_S24         = _OPC(2, 49), /* 24b mul into 32b result with sign extension */
-	OPC_MULL_U          = _OPC(2, 50),
-	OPC_BFREV_B         = _OPC(2, 51),
-	OPC_CLZ_S           = _OPC(2, 52),
-	OPC_CLZ_B           = _OPC(2, 53),
-	OPC_SHL_B           = _OPC(2, 54),
-	OPC_SHR_B           = _OPC(2, 55),
-	OPC_ASHR_B          = _OPC(2, 56),
-	OPC_BARY_F          = _OPC(2, 57),
-	OPC_MGEN_B          = _OPC(2, 58),
-	OPC_GETBIT_B        = _OPC(2, 59),
-	OPC_SETRM           = _OPC(2, 60),
-	OPC_CBITS_B         = _OPC(2, 61),
-	OPC_SHB             = _OPC(2, 62),
-	OPC_MSAD            = _OPC(2, 63),
-
-	/* category 3: */
-	OPC_MAD_U16         = _OPC(3, 0),
-	OPC_MADSH_U16       = _OPC(3, 1),
-	OPC_MAD_S16         = _OPC(3, 2),
-	OPC_MADSH_M16       = _OPC(3, 3),   /* should this be .s16? */
-	OPC_MAD_U24         = _OPC(3, 4),
-	OPC_MAD_S24         = _OPC(3, 5),
-	OPC_MAD_F16         = _OPC(3, 6),
-	OPC_MAD_F32         = _OPC(3, 7),
-	OPC_SEL_B16         = _OPC(3, 8),
-	OPC_SEL_B32         = _OPC(3, 9),
-	OPC_SEL_S16         = _OPC(3, 10),
-	OPC_SEL_S32         = _OPC(3, 11),
-	OPC_SEL_F16         = _OPC(3, 12),
-	OPC_SEL_F32         = _OPC(3, 13),
-	OPC_SAD_S16         = _OPC(3, 14),
-	OPC_SAD_S32         = _OPC(3, 15),
-
-	/* category 4: */
-	OPC_RCP             = _OPC(4, 0),
-	OPC_RSQ             = _OPC(4, 1),
-	OPC_LOG2            = _OPC(4, 2),
-	OPC_EXP2            = _OPC(4, 3),
-	OPC_SIN             = _OPC(4, 4),
-	OPC_COS             = _OPC(4, 5),
-	OPC_SQRT            = _OPC(4, 6),
-	/* NOTE that these are 8+opc from their highp equivs, so it's possible
-	 * that the high order bit in the opc field has been repurposed for
-	 * half-precision use?  But note that other ops (rcp/lsin/cos/sqrt)
-	 * still use the same opc as highp
-	 */
-	OPC_HRSQ            = _OPC(4, 9),
-	OPC_HLOG2           = _OPC(4, 10),
-	OPC_HEXP2           = _OPC(4, 11),
-
-	/* category 5: */
-	OPC_ISAM            = _OPC(5, 0),
-	OPC_ISAML           = _OPC(5, 1),
-	OPC_ISAMM           = _OPC(5, 2),
-	OPC_SAM             = _OPC(5, 3),
-	OPC_SAMB            = _OPC(5, 4),
-	OPC_SAML            = _OPC(5, 5),
-	OPC_SAMGQ           = _OPC(5, 6),
-	OPC_GETLOD          = _OPC(5, 7),
-	OPC_CONV            = _OPC(5, 8),
-	OPC_CONVM           = _OPC(5, 9),
-	OPC_GETSIZE         = _OPC(5, 10),
-	OPC_GETBUF          = _OPC(5, 11),
-	OPC_GETPOS          = _OPC(5, 12),
-	OPC_GETINFO         = _OPC(5, 13),
-	OPC_DSX             = _OPC(5, 14),
-	OPC_DSY             = _OPC(5, 15),
-	OPC_GATHER4R        = _OPC(5, 16),
-	OPC_GATHER4G        = _OPC(5, 17),
-	OPC_GATHER4B        = _OPC(5, 18),
-	OPC_GATHER4A        = _OPC(5, 19),
-	OPC_SAMGP0          = _OPC(5, 20),
-	OPC_SAMGP1          = _OPC(5, 21),
-	OPC_SAMGP2          = _OPC(5, 22),
-	OPC_SAMGP3          = _OPC(5, 23),
-	OPC_DSXPP_1         = _OPC(5, 24),
-	OPC_DSYPP_1         = _OPC(5, 25),
-	OPC_RGETPOS         = _OPC(5, 26),
-	OPC_RGETINFO        = _OPC(5, 27),
-
-	/* category 6: */
-	OPC_LDG             = _OPC(6, 0),        /* load-global */
-	OPC_LDL             = _OPC(6, 1),
-	OPC_LDP             = _OPC(6, 2),
-	OPC_STG             = _OPC(6, 3),        /* store-global */
-	OPC_STL             = _OPC(6, 4),
-	OPC_STP             = _OPC(6, 5),
-	OPC_LDIB            = _OPC(6, 6),
-	OPC_G2L             = _OPC(6, 7),
-	OPC_L2G             = _OPC(6, 8),
-	OPC_PREFETCH        = _OPC(6, 9),
-	OPC_LDLW            = _OPC(6, 10),
-	OPC_STLW            = _OPC(6, 11),
-	OPC_RESFMT          = _OPC(6, 14),
-	OPC_RESINFO         = _OPC(6, 15),
-	OPC_ATOMIC_ADD      = _OPC(6, 16),
-	OPC_ATOMIC_SUB      = _OPC(6, 17),
-	OPC_ATOMIC_XCHG     = _OPC(6, 18),
-	OPC_ATOMIC_INC      = _OPC(6, 19),
-	OPC_ATOMIC_DEC      = _OPC(6, 20),
-	OPC_ATOMIC_CMPXCHG  = _OPC(6, 21),
-	OPC_ATOMIC_MIN      = _OPC(6, 22),
-	OPC_ATOMIC_MAX      = _OPC(6, 23),
-	OPC_ATOMIC_AND      = _OPC(6, 24),
-	OPC_ATOMIC_OR       = _OPC(6, 25),
-	OPC_ATOMIC_XOR      = _OPC(6, 26),
-	OPC_LDGB            = _OPC(6, 27),
-	OPC_STGB            = _OPC(6, 28),
-	OPC_STIB            = _OPC(6, 29),
-	OPC_LDC             = _OPC(6, 30),
-	OPC_LDLV            = _OPC(6, 31),
-
-	/* category 7: */
-	OPC_BAR             = _OPC(7, 0),
-	OPC_FENCE           = _OPC(7, 1),
-
-	/* meta instructions (category -1): */
-	/* placeholder instr to mark shader inputs: */
-	OPC_META_INPUT      = _OPC(-1, 0),
-	/* The "collect" and "split" instructions are used for keeping
-	 * track of instructions that write to multiple dst registers
-	 * (split) like texture sample instructions, or read multiple
-	 * consecutive scalar registers (collect) (bary.f, texture samp)
-	 *
-	 * A "split" extracts a scalar component from a vecN, and a
-	 * "collect" gathers multiple scalar components into a vecN
-	 */
-	OPC_META_SPLIT      = _OPC(-1, 2),
-	OPC_META_COLLECT    = _OPC(-1, 3),
-
-	/* placeholder for texture fetches that run before FS invocation
-	 * starts:
-	 */
-	OPC_META_TEX_PREFETCH = _OPC(-1, 4),
-
-} opc_t;
-
-#define opc_cat(opc) ((int)((opc) >> NOPC_BITS))
-#define opc_op(opc)  ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))
-
-typedef enum {
-	TYPE_F16 = 0,
-	TYPE_F32 = 1,
-	TYPE_U16 = 2,
-	TYPE_U32 = 3,
-	TYPE_S16 = 4,
-	TYPE_S32 = 5,
-	TYPE_U8  = 6,
-	TYPE_S8  = 7,  // XXX I assume?
-} type_t;
-
-static inline uint32_t type_size(type_t type)
-{
-	switch (type) {
-	case TYPE_F32:
-	case TYPE_U32:
-	case TYPE_S32:
-		return 32;
-	case TYPE_F16:
-	case TYPE_U16:
-	case TYPE_S16:
-		return 16;
-	case TYPE_U8:
-	case TYPE_S8:
-		return 8;
-	default:
-		assert(0); /* invalid type */
-		return 0;
-	}
-}
-
-static inline int type_float(type_t type)
-{
-	return (type == TYPE_F32) || (type == TYPE_F16);
-}
-
-static inline int type_uint(type_t type)
-{
-	return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
-}
-
-static inline int type_sint(type_t type)
-{
-	return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
-}
-
-typedef union PACKED {
-	/* normal gpr or const src register: */
-	struct PACKED {
-		uint32_t comp  : 2;
-		uint32_t num   : 10;
-	};
-	/* for immediate val: */
-	int32_t  iim_val   : 11;
-	/* to make compiler happy: */
-	uint32_t dummy32;
-	uint32_t dummy10   : 10;
-	int32_t  idummy10  : 10;
-	uint32_t dummy11   : 11;
-	uint32_t dummy12   : 12;
-	uint32_t dummy13   : 13;
-	uint32_t dummy8    : 8;
-	int32_t  idummy13  : 13;
-	int32_t  idummy8   : 8;
-} reg_t;
-
-/* special registers: */
-#define REG_A0 61       /* address register */
-#define REG_P0 62       /* predicate register */
-
-static inline int reg_special(reg_t reg)
-{
-	return (reg.num == REG_A0) || (reg.num == REG_P0);
-}
-
-typedef enum {
-	BRANCH_PLAIN = 0,   /* br */
-	BRANCH_OR    = 1,   /* brao */
-	BRANCH_AND   = 2,   /* braa */
-	BRANCH_CONST = 3,   /* brac */
-	BRANCH_ANY   = 4,   /* bany */
-	BRANCH_ALL   = 5,   /* ball */
-	BRANCH_X     = 6,   /* brax ??? */
-} brtype_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		struct PACKED {
-			int16_t  immed    : 16;
-			uint32_t dummy1   : 16;
-		} a3xx;
-		struct PACKED {
-			int32_t  immed    : 20;
-			uint32_t dummy1   : 12;
-		} a4xx;
-		struct PACKED {
-			int32_t immed     : 32;
-		} a5xx;
-	};
-
-	/* dword1: */
-	uint32_t idx      : 5;  /* brac.N index */
-	uint32_t brtype   : 3;  /* branch type, see brtype_t */
-	uint32_t repeat   : 3;
-	uint32_t dummy3   : 1;
-	uint32_t ss       : 1;
-	uint32_t inv1     : 1;
-	uint32_t comp1    : 2;
-	uint32_t eq       : 1;
-	uint32_t opc_hi   : 1;  /* at least one bit */
-	uint32_t dummy4   : 2;
-	uint32_t inv0     : 1;
-	uint32_t comp0    : 2;  /* component for first src */
-	uint32_t opc      : 4;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat0_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		/* for normal src register: */
-		struct PACKED {
-			uint32_t src : 11;
-			/* at least low bit of pad must be zero or it will
-			 * look like a address relative src
-			 */
-			uint32_t pad : 21;
-		};
-		/* for address relative: */
-		struct PACKED {
-			int32_t  off : 10;
-			uint32_t src_rel_c : 1;
-			uint32_t src_rel : 1;
-			uint32_t unknown : 20;
-		};
-		/* for immediate: */
-		int32_t  iim_val;
-		uint32_t uim_val;
-		float    fim_val;
-	};
-
-	/* dword1: */
-	uint32_t dst        : 8;
-	uint32_t repeat     : 3;
-	uint32_t src_r      : 1;
-	uint32_t ss         : 1;
-	uint32_t ul         : 1;
-	uint32_t dst_type   : 3;
-	uint32_t dst_rel    : 1;
-	uint32_t src_type   : 3;
-	uint32_t src_c      : 1;
-	uint32_t src_im     : 1;
-	uint32_t even       : 1;
-	uint32_t pos_inf    : 1;
-	uint32_t must_be_0  : 2;
-	uint32_t jmp_tgt    : 1;
-	uint32_t sync       : 1;
-	uint32_t opc_cat    : 3;
-} instr_cat1_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		struct PACKED {
-			uint32_t src1         : 11;
-			uint32_t must_be_zero1: 2;
-			uint32_t src1_im      : 1;   /* immediate */
-			uint32_t src1_neg     : 1;   /* negate */
-			uint32_t src1_abs     : 1;   /* absolute value */
-		};
-		struct PACKED {
-			uint32_t src1         : 10;
-			uint32_t src1_c       : 1;   /* relative-const */
-			uint32_t src1_rel     : 1;   /* relative address */
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel1;
-		struct PACKED {
-			uint32_t src1         : 12;
-			uint32_t src1_c       : 1;   /* const */
-			uint32_t dummy        : 3;
-		} c1;
-	};
-
-	union PACKED {
-		struct PACKED {
-			uint32_t src2         : 11;
-			uint32_t must_be_zero2: 2;
-			uint32_t src2_im      : 1;   /* immediate */
-			uint32_t src2_neg     : 1;   /* negate */
-			uint32_t src2_abs     : 1;   /* absolute value */
-		};
-		struct PACKED {
-			uint32_t src2         : 10;
-			uint32_t src2_c       : 1;   /* relative-const */
-			uint32_t src2_rel     : 1;   /* relative address */
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel2;
-		struct PACKED {
-			uint32_t src2         : 12;
-			uint32_t src2_c       : 1;   /* const */
-			uint32_t dummy        : 3;
-		} c2;
-	};
-
-	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t repeat   : 2;
-	uint32_t sat      : 1;
-	uint32_t src1_r   : 1;   /* doubles as nop0 if repeat==0 */
-	uint32_t ss       : 1;
-	uint32_t ul       : 1;   /* dunno */
-	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-	uint32_t ei       : 1;
-	uint32_t cond     : 3;
-	uint32_t src2_r   : 1;   /* doubles as nop1 if repeat==0 */
-	uint32_t full     : 1;   /* not half */
-	uint32_t opc      : 6;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat2_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		struct PACKED {
-			uint32_t src1         : 11;
-			uint32_t must_be_zero1: 2;
-			uint32_t src2_c       : 1;
-			uint32_t src1_neg     : 1;
-			uint32_t src2_r       : 1;  /* doubles as nop1 if repeat==0 */
-		};
-		struct PACKED {
-			uint32_t src1         : 10;
-			uint32_t src1_c       : 1;
-			uint32_t src1_rel     : 1;
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel1;
-		struct PACKED {
-			uint32_t src1         : 12;
-			uint32_t src1_c       : 1;
-			uint32_t dummy        : 3;
-		} c1;
-	};
-
-	union PACKED {
-		struct PACKED {
-			uint32_t src3         : 11;
-			uint32_t must_be_zero2: 2;
-			uint32_t src3_r       : 1;
-			uint32_t src2_neg     : 1;
-			uint32_t src3_neg     : 1;
-		};
-		struct PACKED {
-			uint32_t src3         : 10;
-			uint32_t src3_c       : 1;
-			uint32_t src3_rel     : 1;
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel2;
-		struct PACKED {
-			uint32_t src3         : 12;
-			uint32_t src3_c       : 1;
-			uint32_t dummy        : 3;
-		} c2;
-	};
-
-	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t repeat   : 2;
-	uint32_t sat      : 1;
-	uint32_t src1_r   : 1;   /* doubles as nop0 if repeat==0 */
-	uint32_t ss       : 1;
-	uint32_t ul       : 1;
-	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-	uint32_t src2     : 8;
-	uint32_t opc      : 4;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat3_t;
-
-static inline bool instr_cat3_full(instr_cat3_t *cat3)
-{
-	switch (_OPC(3, cat3->opc)) {
-	case OPC_MAD_F16:
-	case OPC_MAD_U16:
-	case OPC_MAD_S16:
-	case OPC_SEL_B16:
-	case OPC_SEL_S16:
-	case OPC_SEL_F16:
-	case OPC_SAD_S16:
-	case OPC_SAD_S32:  // really??
-		return false;
-	default:
-		return true;
-	}
-}
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		struct PACKED {
-			uint32_t src          : 11;
-			uint32_t must_be_zero1: 2;
-			uint32_t src_im       : 1;   /* immediate */
-			uint32_t src_neg      : 1;   /* negate */
-			uint32_t src_abs      : 1;   /* absolute value */
-		};
-		struct PACKED {
-			uint32_t src          : 10;
-			uint32_t src_c        : 1;   /* relative-const */
-			uint32_t src_rel      : 1;   /* relative address */
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel;
-		struct PACKED {
-			uint32_t src          : 12;
-			uint32_t src_c        : 1;   /* const */
-			uint32_t dummy        : 3;
-		} c;
-	};
-	uint32_t dummy1   : 16;  /* seem to be ignored */
-
-	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t repeat   : 2;
-	uint32_t sat      : 1;
-	uint32_t src_r    : 1;
-	uint32_t ss       : 1;
-	uint32_t ul       : 1;
-	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-	uint32_t dummy2   : 5;   /* seem to be ignored */
-	uint32_t full     : 1;   /* not half */
-	uint32_t opc      : 6;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat4_t;
-
-/* With is_bindless_s2en = 1, this determines whether bindless is enabled and
- * if so, how to get the (base, index) pair for both sampler and texture.
- * There is a single base embedded in the instruction, which is always used
- * for the texture.
- */
-typedef enum {
-	/* Use traditional GL binding model, get texture and sampler index
-	 * from src3 which is not presumed to be uniform. This is
-	 * backwards-compatible with earlier generations, where this field was
-	 * always 0 and nonuniform-indexed sampling always worked.
-	 */
-	CAT5_NONUNIFORM = 0,
-
-	/* The sampler base comes from the low 3 bits of a1.x, and the sampler
-	 * and texture index come from src3 which is presumed to be uniform.
-	 */
-	CAT5_BINDLESS_A1_UNIFORM = 1,
-
-	/* The texture and sampler share the same base, and the sampler and
-	 * texture index come from src3 which is *not* presumed to be uniform.
-	 */
-	CAT5_BINDLESS_NONUNIFORM = 2,
-
-	/* The sampler base comes from the low 3 bits of a1.x, and the sampler
-	 * and texture index come from src3 which is *not* presumed to be
-	 * uniform.
-	 */
-	CAT5_BINDLESS_A1_NONUNIFORM = 3,
-
-	/* Use traditional GL binding model, get texture and sampler index
-	 * from src3 which is presumed to be uniform.
-	 */
-	CAT5_UNIFORM = 4,
-
-	/* The texture and sampler share the same base, and the sampler and
-	 * texture index come from src3 which is presumed to be uniform.
-	 */
-	CAT5_BINDLESS_UNIFORM = 5,
-
-	/* The texture and sampler share the same base, get sampler index from low
-	 * 4 bits of src3 and texture index from high 4 bits.
-	 */
-	CAT5_BINDLESS_IMM = 6,
-
-	/* The sampler base comes from the low 3 bits of a1.x, and the texture
-	 * index comes from the next 8 bits of a1.x. The sampler index is an
-	 * immediate in src3.
-	 */
-	CAT5_BINDLESS_A1_IMM = 7,
-} cat5_desc_mode_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		/* normal case: */
-		struct PACKED {
-			uint32_t full     : 1;   /* not half */
-			uint32_t src1     : 8;
-			uint32_t src2     : 8;
-			uint32_t dummy1   : 4;   /* seem to be ignored */
-			uint32_t samp     : 4;
-			uint32_t tex      : 7;
-		} norm;
-		/* s2en case: */
-		struct PACKED {
-			uint32_t full         : 1;   /* not half */
-			uint32_t src1         : 8;
-			uint32_t src2         : 8;
-			uint32_t dummy1       : 2;
-			uint32_t base_hi      : 2;
-			uint32_t src3         : 8;
-			uint32_t desc_mode    : 3;
-		} s2en_bindless;
-		/* same in either case: */
-		// XXX I think, confirm this
-		struct PACKED {
-			uint32_t full     : 1;   /* not half */
-			uint32_t src1     : 8;
-			uint32_t src2     : 8;
-			uint32_t pad      : 15;
-		};
-	};
-
-	/* dword1: */
-	uint32_t dst              : 8;
-	uint32_t wrmask           : 4;   /* write-mask */
-	uint32_t type             : 3;
-	uint32_t base_lo          : 1;   /* used with bindless */
-	uint32_t is_3d            : 1;
-
-	uint32_t is_a             : 1;
-	uint32_t is_s             : 1;
-	uint32_t is_s2en_bindless : 1;
-	uint32_t is_o             : 1;
-	uint32_t is_p             : 1;
-
-	uint32_t opc              : 5;
-	uint32_t jmp_tgt          : 1;
-	uint32_t sync             : 1;
-	uint32_t opc_cat          : 3;
-} instr_cat5_t;
-
-/* dword0 encoding for src_off: [src1 + off], src2: */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t mustbe1  : 1;
-	int32_t  off      : 13;
-	uint32_t src1     : 8;
-	uint32_t src1_im  : 1;
-	uint32_t src2_im  : 1;
-	uint32_t src2     : 8;
-
-	/* dword1: */
-	uint32_t dword1;
-} instr_cat6a_t;
-
-/* dword0 encoding for !src_off: [src1], src2 */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t mustbe0  : 1;
-	uint32_t src1     : 13;
-	uint32_t ignore0  : 8;
-	uint32_t src1_im  : 1;
-	uint32_t src2_im  : 1;
-	uint32_t src2     : 8;
-
-	/* dword1: */
-	uint32_t dword1;
-} instr_cat6b_t;
-
-/* dword1 encoding for dst_off: */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t dword0;
-
-	/* note: there is some weird stuff going on where sometimes
-	 * cat6->a.off is involved.. but that seems like a bug in
-	 * the blob, since it is used even if !cat6->src_off
-	 * It would make sense for there to be some more bits to
-	 * bring us to 11 bits worth of offset, but not sure..
-	 */
-	int32_t off       : 8;
-	uint32_t mustbe1  : 1;
-	uint32_t dst      : 8;
-	uint32_t pad1     : 15;
-} instr_cat6c_t;
-
-/* dword1 encoding for !dst_off: */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t dword0;
-
-	uint32_t dst      : 8;
-	uint32_t mustbe0  : 1;
-	uint32_t idx      : 8;
-	uint32_t pad0     : 15;
-} instr_cat6d_t;
-
-/* ldgb and atomics..
- *
- * ldgb:      pad0=0, pad3=1
- * atomic .g: pad0=1, pad3=1
- *        .l: pad0=1, pad3=0
- */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t pad0     : 1;
-	uint32_t src3     : 8;
-	uint32_t d        : 2;
-	uint32_t typed    : 1;
-	uint32_t type_size : 2;
-	uint32_t src1     : 8;
-	uint32_t src1_im  : 1;
-	uint32_t src2_im  : 1;
-	uint32_t src2     : 8;
-
-	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t mustbe0  : 1;
-	uint32_t src_ssbo : 8;
-	uint32_t pad2     : 3;  // type
-	uint32_t g        : 1;
-	uint32_t pad3     : 1;
-	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
-} instr_cat6ldgb_t;
-
-/* stgb, pad0=0, pad3=2
- */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t mustbe1  : 1;  // ???
-	uint32_t src1     : 8;
-	uint32_t d        : 2;
-	uint32_t typed    : 1;
-	uint32_t type_size : 2;
-	uint32_t pad0     : 9;
-	uint32_t src2_im  : 1;
-	uint32_t src2     : 8;
-
-	/* dword1: */
-	uint32_t src3     : 8;
-	uint32_t src3_im  : 1;
-	uint32_t dst_ssbo : 8;
-	uint32_t pad2     : 3;  // type
-	uint32_t pad3     : 2;
-	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
-} instr_cat6stgb_t;
-
-typedef union PACKED {
-	instr_cat6a_t a;
-	instr_cat6b_t b;
-	instr_cat6c_t c;
-	instr_cat6d_t d;
-	instr_cat6ldgb_t ldgb;
-	instr_cat6stgb_t stgb;
-	struct PACKED {
-		/* dword0: */
-		uint32_t src_off  : 1;
-		uint32_t pad1     : 31;
-
-		/* dword1: */
-		uint32_t pad2     : 8;
-		uint32_t dst_off  : 1;
-		uint32_t pad3     : 8;
-		uint32_t type     : 3;
-		uint32_t g        : 1;  /* or in some cases it means dst immed */
-		uint32_t pad4     : 1;
-		uint32_t opc      : 5;
-		uint32_t jmp_tgt  : 1;
-		uint32_t sync     : 1;
-		uint32_t opc_cat  : 3;
-	};
-} instr_cat6_t;
-
-/* Similar to cat5_desc_mode_t, describes how the descriptor is loaded.
- */
-typedef enum {
-	/* Use old GL binding model with an immediate index. */
-	CAT6_IMM = 0,
-
-	CAT6_UNIFORM = 1,
-
-	CAT6_NONUNIFORM = 2,
-
-	/* Use the bindless model, with an immediate index.
-	 */
-	CAT6_BINDLESS_IMM = 4,
-
-	/* Use the bindless model, with a uniform register index.
-	 */
-	CAT6_BINDLESS_UNIFORM = 5,
-
-	/* Use the bindless model, with a register index that isn't guaranteed
-	 * to be uniform. This presumably checks if the indices are equal and
-	 * splits up the load/store, because it works the way you would
-	 * expect.
-	 */
-	CAT6_BINDLESS_NONUNIFORM = 6,
-} cat6_desc_mode_t;
-
-/**
- * For atomic ops (which return a value):
- *
- *    pad1=1, pad3=c, pad5=3
- *    src1    - vecN offset/coords
- *    src2.x  - is actually dest register
- *    src2.y  - is 'data' except for cmpxchg where src2.y is 'compare'
- *              and src2.z is 'data'
- *
- * For stib (which does not return a value):
- *    pad1=0, pad3=c, pad5=2
- *    src1    - vecN offset/coords
- *    src2    - value to store
- *
- * For ldib:
- *    pad1=1, pad3=c, pad5=2
- *    src1    - vecN offset/coords
- *
- * for ldc (load from UBO using descriptor):
- *    pad1=0, pad3=8, pad5=2
- *
- * pad2 and pad5 are only observed to be 0.
- */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t pad1     : 1;
-	uint32_t base     : 3;
-	uint32_t pad2     : 2;
-	uint32_t desc_mode : 3;
-	uint32_t d        : 2;
-	uint32_t typed    : 1;
-	uint32_t type_size : 2;
-	uint32_t opc      : 5;
-	uint32_t pad3     : 5;
-	uint32_t src1     : 8;  /* coordinate/offset */
-
-	/* dword1: */
-	uint32_t src2     : 8;  /* or the dst for load instructions */
-	uint32_t pad4     : 1;  //mustbe0 ??
-	uint32_t ssbo     : 8;  /* ssbo/image binding point */
-	uint32_t type     : 3;
-	uint32_t pad5     : 7;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat6_a6xx_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t pad1     : 32;
-
-	/* dword1: */
-	uint32_t pad2     : 12;
-	uint32_t ss       : 1;  /* maybe in the encoding, but blob only uses (sy) */
-	uint32_t pad3     : 6;
-	uint32_t w        : 1;  /* write */
-	uint32_t r        : 1;  /* read */
-	uint32_t l        : 1;  /* local */
-	uint32_t g        : 1;  /* global */
-	uint32_t opc      : 4;  /* presumed, but only a couple known OPCs */
-	uint32_t jmp_tgt  : 1;  /* (jp) */
-	uint32_t sync     : 1;  /* (sy) */
-	uint32_t opc_cat  : 3;
-} instr_cat7_t;
-
-typedef union PACKED {
-	instr_cat0_t cat0;
-	instr_cat1_t cat1;
-	instr_cat2_t cat2;
-	instr_cat3_t cat3;
-	instr_cat4_t cat4;
-	instr_cat5_t cat5;
-	instr_cat6_t cat6;
-	instr_cat6_a6xx_t cat6_a6xx;
-	instr_cat7_t cat7;
-	struct PACKED {
-		/* dword0: */
-		uint32_t pad1     : 32;
-
-		/* dword1: */
-		uint32_t pad2     : 12;
-		uint32_t ss       : 1;  /* cat1-cat4 (cat0??) and cat7 (?) */
-		uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
-		uint32_t pad3     : 13;
-		uint32_t jmp_tgt  : 1;
-		uint32_t sync     : 1;
-		uint32_t opc_cat  : 3;
-
-	};
-} instr_t;
-
-static inline uint32_t instr_repeat(instr_t *instr)
-{
-	switch (instr->opc_cat) {
-	case 0:  return instr->cat0.repeat;
-	case 1:  return instr->cat1.repeat;
-	case 2:  return instr->cat2.repeat;
-	case 3:  return instr->cat3.repeat;
-	case 4:  return instr->cat4.repeat;
-	default: return 0;
-	}
-}
-
-static inline bool instr_sat(instr_t *instr)
-{
-	switch (instr->opc_cat) {
-	case 2:  return instr->cat2.sat;
-	case 3:  return instr->cat3.sat;
-	case 4:  return instr->cat4.sat;
-	default: return false;
-	}
-}
-
-/* We can probably drop the gpu_id arg, but keeping it for now so we can
- * assert if we see something we think should be new encoding on an older
- * gpu.
- */
-static inline bool is_cat6_legacy(instr_t *instr, unsigned gpu_id)
-{
-	instr_cat6_a6xx_t *cat6 = &instr->cat6_a6xx;
-
-	/* At least one of these two bits is pad in all the possible
-	 * "legacy" cat6 encodings, and a analysis of all the pre-a6xx
-	 * cmdstream traces I have indicates that the pad bit is zero
-	 * in all cases.  So we can use this to detect new encoding:
-	 */
-	if ((cat6->pad3 & 0x8) && (cat6->pad5 & 0x2)) {
-		assert(gpu_id >= 600);
-		assert(instr->cat6.opc == 0);
-		return false;
-	}
-
-	return true;
-}
-
-static inline uint32_t instr_opc(instr_t *instr, unsigned gpu_id)
-{
-	switch (instr->opc_cat) {
-	case 0:  return instr->cat0.opc | instr->cat0.opc_hi << 4;
-	case 1:  return 0;
-	case 2:  return instr->cat2.opc;
-	case 3:  return instr->cat3.opc;
-	case 4:  return instr->cat4.opc;
-	case 5:  return instr->cat5.opc;
-	case 6:
-		if (!is_cat6_legacy(instr, gpu_id))
-			return instr->cat6_a6xx.opc;
-		return instr->cat6.opc;
-	case 7:  return instr->cat7.opc;
-	default: return 0;
-	}
-}
-
-static inline bool is_mad(opc_t opc)
-{
-	switch (opc) {
-	case OPC_MAD_U16:
-	case OPC_MAD_S16:
-	case OPC_MAD_U24:
-	case OPC_MAD_S24:
-	case OPC_MAD_F16:
-	case OPC_MAD_F32:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_madsh(opc_t opc)
-{
-	switch (opc) {
-	case OPC_MADSH_U16:
-	case OPC_MADSH_M16:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_atomic(opc_t opc)
-{
-	switch (opc) {
-	case OPC_ATOMIC_ADD:
-	case OPC_ATOMIC_SUB:
-	case OPC_ATOMIC_XCHG:
-	case OPC_ATOMIC_INC:
-	case OPC_ATOMIC_DEC:
-	case OPC_ATOMIC_CMPXCHG:
-	case OPC_ATOMIC_MIN:
-	case OPC_ATOMIC_MAX:
-	case OPC_ATOMIC_AND:
-	case OPC_ATOMIC_OR:
-	case OPC_ATOMIC_XOR:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_ssbo(opc_t opc)
-{
-	switch (opc) {
-	case OPC_RESFMT:
-	case OPC_RESINFO:
-	case OPC_LDGB:
-	case OPC_STGB:
-	case OPC_STIB:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_isam(opc_t opc)
-{
-	switch (opc) {
-	case OPC_ISAM:
-	case OPC_ISAML:
-	case OPC_ISAMM:
-		return true;
-	default:
-		return false;
-	}
-}
-
-
-static inline bool is_cat2_float(opc_t opc)
-{
-	switch (opc) {
-	case OPC_ADD_F:
-	case OPC_MIN_F:
-	case OPC_MAX_F:
-	case OPC_MUL_F:
-	case OPC_SIGN_F:
-	case OPC_CMPS_F:
-	case OPC_ABSNEG_F:
-	case OPC_CMPV_F:
-	case OPC_FLOOR_F:
-	case OPC_CEIL_F:
-	case OPC_RNDNE_F:
-	case OPC_RNDAZ_F:
-	case OPC_TRUNC_F:
-		return true;
-
-	default:
-		return false;
-	}
-}
-
-static inline bool is_cat3_float(opc_t opc)
-{
-	switch (opc) {
-	case OPC_MAD_F16:
-	case OPC_MAD_F32:
-	case OPC_SEL_F16:
-	case OPC_SEL_F32:
-		return true;
-	default:
-		return false;
-	}
-}
-
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id);
-
-#endif /* INSTR_A3XX_H_ */
diff --git a/extra/disassemblers/adreno/ir3.h b/extra/disassemblers/adreno/ir3.h
deleted file mode 100644
index 5c0c297ddc..0000000000
--- a/extra/disassemblers/adreno/ir3.h
+++ /dev/null
@@ -1,1757 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IR3_H_
-#define IR3_H_
-
-#include <stdint.h>
-#include <stdbool.h>
-
-#include "shader_enums.h"
-#include "util/list.h"
-
-#include "util/bitscan.h"
-/*#include "util/list.h"
-#include "util/set.h"
-#include "util/u_debug.h"*/
-
-#define debug_assert(x) assert(x)
-
-#include "instr-a3xx.h"
-
-/* low level intermediate representation of an adreno shader program */
-
-struct ir3_compiler;
-struct ir3;
-struct ir3_instruction;
-struct ir3_block;
-
-struct ir3_info {
-	uint32_t gpu_id;
-	uint16_t sizedwords;
-	uint16_t instrs_count;   /* expanded to account for rpt's */
-	uint16_t nops_count;     /* # of nop instructions, including nopN */
-	uint16_t mov_count;
-	uint16_t cov_count;
-	/* NOTE: max_reg, etc, does not include registers not touched
-	 * by the shader (ie. vertex fetched via VFD_DECODE but not
-	 * touched by shader)
-	 */
-	int8_t   max_reg;   /* highest GPR # used by shader */
-	int8_t   max_half_reg;
-	int16_t  max_const;
-
-	/* number of sync bits: */
-	uint16_t ss, sy;
-
-	/* estimate of number of cycles stalled on (ss) */
-	uint16_t sstall;
-
-	uint16_t last_baryf;     /* instruction # of last varying fetch */
-};
-
-struct ir3_register {
-	enum {
-		IR3_REG_CONST  = 0x001,
-		IR3_REG_IMMED  = 0x002,
-		IR3_REG_HALF   = 0x004,
-		/* high registers are used for some things in compute shaders,
-		 * for example.  Seems to be for things that are global to all
-		 * threads in a wave, so possibly these are global/shared by
-		 * all the threads in the wave?
-		 */
-		IR3_REG_HIGH   = 0x008,
-		IR3_REG_RELATIV= 0x010,
-		IR3_REG_R      = 0x020,
-		/* Most instructions, it seems, can do float abs/neg but not
-		 * integer.  The CP pass needs to know what is intended (int or
-		 * float) in order to do the right thing.  For this reason the
-		 * abs/neg flags are split out into float and int variants.  In
-		 * addition, .b (bitwise) operations, the negate is actually a
-		 * bitwise not, so split that out into a new flag to make it
-		 * more clear.
-		 */
-		IR3_REG_FNEG   = 0x040,
-		IR3_REG_FABS   = 0x080,
-		IR3_REG_SNEG   = 0x100,
-		IR3_REG_SABS   = 0x200,
-		IR3_REG_BNOT   = 0x400,
-		IR3_REG_EVEN   = 0x800,
-		IR3_REG_POS_INF= 0x1000,
-		/* (ei) flag, end-input?  Set on last bary, presumably to signal
-		 * that the shader needs no more input:
-		 */
-		IR3_REG_EI     = 0x2000,
-		/* meta-flags, for intermediate stages of IR, ie.
-		 * before register assignment is done:
-		 */
-		IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
-		IR3_REG_ARRAY  = 0x8000,
-
-	} flags;
-
-	/* used for cat5 instructions, but also for internal/IR level
-	 * tracking of what registers are read/written by an instruction.
-	 * wrmask may be a bad name since it is used to represent both
-	 * src and dst that touch multiple adjacent registers.
-	 */
-	unsigned wrmask : 16;  /* up to vec16 */
-
-	/* for relative addressing, 32bits for array size is too small,
-	 * but otoh we don't need to deal with disjoint sets, so instead
-	 * use a simple size field (number of scalar components).
-	 *
-	 * Note the size field isn't important for relative const (since
-	 * we don't have to do register allocation for constants).
-	 */
-	unsigned size : 15;
-
-	bool merged : 1;    /* half-regs conflict with full regs (ie >= a6xx) */
-
-	/* normal registers:
-	 * the component is in the low two bits of the reg #, so
-	 * rN.x becomes: (N << 2) | x
-	 */
-	uint16_t num;
-	union {
-		/* immediate: */
-		int32_t  iim_val;
-		uint32_t uim_val;
-		float    fim_val;
-		/* relative: */
-		struct {
-			uint16_t id;
-			int16_t offset;
-		} array;
-	};
-
-	/* For IR3_REG_SSA, src registers contain ptr back to assigning
-	 * instruction.
-	 *
-	 * For IR3_REG_ARRAY, the pointer is back to the last dependent
-	 * array access (although the net effect is the same, it points
-	 * back to a previous instruction that we depend on).
-	 */
-	struct ir3_instruction *instr;
-};
-
-/*
- * Stupid/simple growable array implementation:
- */
-#define DECLARE_ARRAY(type, name) \
-	unsigned name ## _count, name ## _sz; \
-	type * name;
-
-#define array_insert(ctx, arr, val) do { \
-		if (arr ## _count == arr ## _sz) { \
-			arr ## _sz = MAX2(2 * arr ## _sz, 16); \
-			arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
-		} \
-		arr[arr ##_count++] = val; \
-	} while (0)
-
-struct ir3_instruction {
-	struct ir3_block *block;
-	opc_t opc;
-	enum {
-		/* (sy) flag is set on first instruction, and after sample
-		 * instructions (probably just on RAW hazard).
-		 */
-		IR3_INSTR_SY    = 0x001,
-		/* (ss) flag is set on first instruction, and first instruction
-		 * to depend on the result of "long" instructions (RAW hazard):
-		 *
-		 *   rcp, rsq, log2, exp2, sin, cos, sqrt
-		 *
-		 * It seems to synchronize until all in-flight instructions are
-		 * completed, for example:
-		 *
-		 *   rsq hr1.w, hr1.w
-		 *   add.f hr2.z, (neg)hr2.z, hc0.y
-		 *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
-		 *   rsq hr2.x, hr2.x
-		 *   (rpt1)nop
-		 *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
-		 *   nop
-		 *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
-		 *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
-		 *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
-		 *
-		 * The last mul.f does not have (ss) set, presumably because the
-		 * (ss) on the previous instruction does the job.
-		 *
-		 * The blob driver also seems to set it on WAR hazards, although
-		 * not really clear if this is needed or just blob compiler being
-		 * sloppy.  So far I haven't found a case where removing the (ss)
-		 * causes problems for WAR hazard, but I could just be getting
-		 * lucky:
-		 *
-		 *   rcp r1.y, r3.y
-		 *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
-		 *
-		 */
-		IR3_INSTR_SS    = 0x002,
-		/* (jp) flag is set on jump targets:
-		 */
-		IR3_INSTR_JP    = 0x004,
-		IR3_INSTR_UL    = 0x008,
-		IR3_INSTR_3D    = 0x010,
-		IR3_INSTR_A     = 0x020,
-		IR3_INSTR_O     = 0x040,
-		IR3_INSTR_P     = 0x080,
-		IR3_INSTR_S     = 0x100,
-		IR3_INSTR_S2EN  = 0x200,
-		IR3_INSTR_G     = 0x400,
-		IR3_INSTR_SAT   = 0x800,
-		/* (cat5/cat6) Bindless */
-		IR3_INSTR_B     = 0x1000,
-		/* (cat5-only) Get some parts of the encoding from a1.x */
-		IR3_INSTR_A1EN  = 0x2000,
-		/* meta-flags, for intermediate stages of IR, ie.
-		 * before register assignment is done:
-		 */
-		IR3_INSTR_MARK  = 0x4000,
-		IR3_INSTR_UNUSED= 0x8000,
-	} flags;
-	uint8_t repeat;
-	uint8_t nop;
-#ifdef DEBUG
-	unsigned regs_max;
-#endif
-	unsigned regs_count;
-	struct ir3_register **regs;
-	union {
-		struct {
-			char inv;
-			char comp;
-			int  immed;
-			struct ir3_block *target;
-		} cat0;
-		struct {
-			type_t src_type, dst_type;
-		} cat1;
-		struct {
-			enum {
-				IR3_COND_LT = 0,
-				IR3_COND_LE = 1,
-				IR3_COND_GT = 2,
-				IR3_COND_GE = 3,
-				IR3_COND_EQ = 4,
-				IR3_COND_NE = 5,
-			} condition;
-		} cat2;
-		struct {
-			unsigned samp, tex;
-			unsigned tex_base : 3;
-			type_t type;
-		} cat5;
-		struct {
-			type_t type;
-			int src_offset;
-			int dst_offset;
-			int iim_val : 3;      /* for ldgb/stgb, # of components */
-			unsigned d : 3;       /* for ldc, component offset */
-			bool typed : 1;
-			unsigned base : 3;
-		} cat6;
-		struct {
-			unsigned w : 1;       /* write */
-			unsigned r : 1;       /* read */
-			unsigned l : 1;       /* local */
-			unsigned g : 1;       /* global */
-		} cat7;
-		/* for meta-instructions, just used to hold extra data
-		 * before instruction scheduling, etc
-		 */
-		struct {
-			int off;              /* component/offset */
-		} split;
-		struct {
-			/* for output collects, this maps back to the entry in the
-			 * ir3_shader_variant::outputs table.
-			 */
-			int outidx;
-		} collect;
-		struct {
-			unsigned samp, tex;
-			unsigned input_offset;
-			unsigned samp_base : 3;
-			unsigned tex_base : 3;
-		} prefetch;
-		struct {
-			/* maps back to entry in ir3_shader_variant::inputs table: */
-			int inidx;
-			/* for sysvals, identifies the sysval type.  Mostly so we can
-			 * identify the special cases where a sysval should not be DCE'd
-			 * (currently, just pre-fs texture fetch)
-			 */
-			gl_system_value sysval;
-		} input;
-	};
-
-	/* When we get to the RA stage, we need instruction's position/name: */
-	uint16_t ip;
-	uint16_t name;
-
-	/* used for per-pass extra instruction data.
-	 *
-	 * TODO we should remove the per-pass data like this and 'use_count'
-	 * and do something similar to what RA does w/ ir3_ra_instr_data..
-	 * ie. use the ir3_count_instructions pass, and then use instr->ip
-	 * to index into a table of pass-private data.
-	 */
-	void *data;
-
-	/**
-	 * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
-	 */
-	struct set *uses;
-
-	int sun;            /* Sethi–Ullman number, used by sched */
-	int use_count;      /* currently just updated/used by cp */
-
-	/* Used during CP and RA stages.  For collect and shader inputs/
-	 * outputs where we need a sequence of consecutive registers,
-	 * keep track of each src instructions left (ie 'n-1') and right
-	 * (ie 'n+1') neighbor.  The front-end must insert enough mov's
-	 * to ensure that each instruction has at most one left and at
-	 * most one right neighbor.  During the copy-propagation pass,
-	 * we only remove mov's when we can preserve this constraint.
-	 * And during the RA stage, we use the neighbor information to
-	 * allocate a block of registers in one shot.
-	 *
-	 * TODO: maybe just add something like:
-	 *   struct ir3_instruction_ref {
-	 *       struct ir3_instruction *instr;
-	 *       unsigned cnt;
-	 *   }
-	 *
-	 * Or can we get away without the refcnt stuff?  It seems like
-	 * it should be overkill..  the problem is if, potentially after
-	 * already eliminating some mov's, if you have a single mov that
-	 * needs to be grouped with it's neighbors in two different
-	 * places (ex. shader output and a collect).
-	 */
-	struct {
-		struct ir3_instruction *left, *right;
-		uint16_t left_cnt, right_cnt;
-	} cp;
-
-	/* an instruction can reference at most one address register amongst
-	 * it's src/dst registers.  Beyond that, you need to insert mov's.
-	 *
-	 * NOTE: do not write this directly, use ir3_instr_set_address()
-	 */
-	struct ir3_instruction *address;
-
-	/* Tracking for additional dependent instructions.  Used to handle
-	 * barriers, WAR hazards for arrays/SSBOs/etc.
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, deps);
-
-	/*
-	 * From PoV of instruction scheduling, not execution (ie. ignores global/
-	 * local distinction):
-	 *                            shared  image  atomic  SSBO  everything
-	 *   barrier()/            -   R/W     R/W    R/W     R/W       X
-	 *     groupMemoryBarrier()
-	 *   memoryBarrier()       -           R/W    R/W
-	 *     (but only images declared coherent?)
-	 *   memoryBarrierAtomic() -                  R/W
-	 *   memoryBarrierBuffer() -                          R/W
-	 *   memoryBarrierImage()  -           R/W
-	 *   memoryBarrierShared() -   R/W
-	 *
-	 * TODO I think for SSBO/image/shared, in cases where we can determine
-	 * which variable is accessed, we don't need to care about accesses to
-	 * different variables (unless declared coherent??)
-	 */
-	enum {
-		IR3_BARRIER_EVERYTHING = 1 << 0,
-		IR3_BARRIER_SHARED_R   = 1 << 1,
-		IR3_BARRIER_SHARED_W   = 1 << 2,
-		IR3_BARRIER_IMAGE_R    = 1 << 3,
-		IR3_BARRIER_IMAGE_W    = 1 << 4,
-		IR3_BARRIER_BUFFER_R   = 1 << 5,
-		IR3_BARRIER_BUFFER_W   = 1 << 6,
-		IR3_BARRIER_ARRAY_R    = 1 << 7,
-		IR3_BARRIER_ARRAY_W    = 1 << 8,
-	} barrier_class, barrier_conflict;
-
-	/* Entry in ir3_block's instruction list: */
-	struct list_head node;
-
-#ifdef DEBUG
-	uint32_t serialno;
-#endif
-
-	// TODO only computerator/assembler:
-	int line;
-};
-
-static inline struct ir3_instruction *
-ir3_neighbor_first(struct ir3_instruction *instr)
-{
-	int cnt = 0;
-	while (instr->cp.left) {
-		instr = instr->cp.left;
-		if (++cnt > 0xffff) {
-			debug_assert(0);
-			break;
-		}
-	}
-	return instr;
-}
-
-static inline int ir3_neighbor_count(struct ir3_instruction *instr)
-{
-	int num = 1;
-
-	debug_assert(!instr->cp.left);
-
-	while (instr->cp.right) {
-		num++;
-		instr = instr->cp.right;
-		if (num > 0xffff) {
-			debug_assert(0);
-			break;
-		}
-	}
-
-	return num;
-}
-
-struct ir3 {
-	struct ir3_compiler *compiler;
-	gl_shader_stage type;
-
-	DECLARE_ARRAY(struct ir3_instruction *, inputs);
-	DECLARE_ARRAY(struct ir3_instruction *, outputs);
-
-	/* Track bary.f (and ldlv) instructions.. this is needed in
-	 * scheduling to ensure that all varying fetches happen before
-	 * any potential kill instructions.  The hw gets grumpy if all
-	 * threads in a group are killed before the last bary.f gets
-	 * a chance to signal end of input (ei).
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, baryfs);
-
-	/* Track all indirect instructions (read and write).  To avoid
-	 * deadlock scenario where an address register gets scheduled,
-	 * but other dependent src instructions cannot be scheduled due
-	 * to dependency on a *different* address register value, the
-	 * scheduler needs to ensure that all dependencies other than
-	 * the instruction other than the address register are scheduled
-	 * before the one that writes the address register.  Having a
-	 * convenient list of instructions that reference some address
-	 * register simplifies this.
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, a0_users);
-
-	/* same for a1.x: */
-	DECLARE_ARRAY(struct ir3_instruction *, a1_users);
-
-	/* and same for instructions that consume predicate register: */
-	DECLARE_ARRAY(struct ir3_instruction *, predicates);
-
-	/* Track texture sample instructions which need texture state
-	 * patched in (for astc-srgb workaround):
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
-
-	/* List of blocks: */
-	struct list_head block_list;
-
-	/* List of ir3_array's: */
-	struct list_head array_list;
-
-	unsigned max_sun;   /* max Sethi–Ullman number */
-
-#ifdef DEBUG
-	unsigned block_count, instr_count;
-#endif
-};
-
-struct ir3_array {
-	struct list_head node;
-	unsigned length;
-	unsigned id;
-
-	struct nir_register *r;
-
-	/* To avoid array write's from getting DCE'd, keep track of the
-	 * most recent write.  Any array access depends on the most
-	 * recent write.  This way, nothing depends on writes after the
-	 * last read.  But all the writes that happen before that have
-	 * something depending on them
-	 */
-	struct ir3_instruction *last_write;
-
-	/* extra stuff used in RA pass: */
-	unsigned base;      /* base vreg name */
-	unsigned reg;       /* base physical reg */
-	uint16_t start_ip, end_ip;
-
-	/* Indicates if half-precision */
-	bool half;
-};
-
-struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
-
-struct ir3_block {
-	struct list_head node;
-	struct ir3 *shader;
-
-	const struct nir_block *nblock;
-
-	struct list_head instr_list;  /* list of ir3_instruction */
-
-	/* each block has either one or two successors.. in case of
-	 * two successors, 'condition' decides which one to follow.
-	 * A block preceding an if/else has two successors.
-	 */
-	struct ir3_instruction *condition;
-	struct ir3_block *successors[2];
-
-	struct set *predecessors;     /* set of ir3_block */
-
-	uint16_t start_ip, end_ip;
-
-	/* Track instructions which do not write a register but other-
-	 * wise must not be discarded (such as kill, stg, etc)
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, keeps);
-
-	/* used for per-pass extra block data.  Mainly used right
-	 * now in RA step to track livein/liveout.
-	 */
-	void *data;
-
-#ifdef DEBUG
-	uint32_t serialno;
-#endif
-};
-
-static inline uint32_t
-block_id(struct ir3_block *block)
-{
-#ifdef DEBUG
-	return block->serialno;
-#else
-	return (uint32_t)(unsigned long)block;
-#endif
-}
-
-struct ir3 * ir3_create(struct ir3_compiler *compiler, gl_shader_stage type);
-void ir3_destroy(struct ir3 *shader);
-void * ir3_assemble(struct ir3 *shader,
-		struct ir3_info *info, uint32_t gpu_id);
-void * ir3_alloc(struct ir3 *shader, int sz);
-
-struct ir3_block * ir3_block_create(struct ir3 *shader);
-
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
-struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
-		opc_t opc, int nreg);
-struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
-void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
-const char *ir3_instr_name(struct ir3_instruction *instr);
-
-struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
-		int num, int flags);
-struct ir3_register * ir3_reg_clone(struct ir3 *shader,
-		struct ir3_register *reg);
-
-void ir3_instr_set_address(struct ir3_instruction *instr,
-		struct ir3_instruction *addr);
-
-static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
-{
-	if (instr->flags & IR3_INSTR_MARK)
-		return true;  /* already visited */
-	instr->flags |= IR3_INSTR_MARK;
-	return false;
-}
-
-void ir3_block_clear_mark(struct ir3_block *block);
-void ir3_clear_mark(struct ir3 *shader);
-
-unsigned ir3_count_instructions(struct ir3 *ir);
-unsigned ir3_count_instructions_ra(struct ir3 *ir);
-
-void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
-
-//#include "util/set.h"
-#define foreach_ssa_use(__use, __instr) \
-	for (struct ir3_instruction *__use = (void *)~0; \
-	     __use && (__instr)->uses; __use = NULL) \
-		set_foreach ((__instr)->uses, __entry) \
-			if ((__use = (void *)__entry->key))
-
-#define MAX_ARRAYS 16
-
-/* comp:
- *   0 - x
- *   1 - y
- *   2 - z
- *   3 - w
- */
-static inline uint32_t regid(int num, int comp)
-{
-	return (num << 2) | (comp & 0x3);
-}
-
-static inline uint32_t reg_num(struct ir3_register *reg)
-{
-	return reg->num >> 2;
-}
-
-static inline uint32_t reg_comp(struct ir3_register *reg)
-{
-	return reg->num & 0x3;
-}
-
-#define INVALID_REG      regid(63, 0)
-#define VALIDREG(r)      ((r) != INVALID_REG)
-#define CONDREG(r, val)  COND(VALIDREG(r), (val))
-
-static inline bool is_flow(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 0);
-}
-
-static inline bool is_kill(struct ir3_instruction *instr)
-{
-	return instr->opc == OPC_KILL;
-}
-
-static inline bool is_nop(struct ir3_instruction *instr)
-{
-	return instr->opc == OPC_NOP;
-}
-
-static inline bool is_same_type_reg(struct ir3_register *reg1,
-		struct ir3_register *reg2)
-{
-	unsigned type_reg1 = (reg1->flags & (IR3_REG_HIGH | IR3_REG_HALF));
-	unsigned type_reg2 = (reg2->flags & (IR3_REG_HIGH | IR3_REG_HALF));
-
-	if (type_reg1 ^ type_reg2)
-		return false;
-	else
-		return true;
-}
-
-/* Is it a non-transformative (ie. not type changing) mov?  This can
- * also include absneg.s/absneg.f, which for the most part can be
- * treated as a mov (single src argument).
- */
-static inline bool is_same_type_mov(struct ir3_instruction *instr)
-{
-	struct ir3_register *dst;
-
-	switch (instr->opc) {
-	case OPC_MOV:
-		if (instr->cat1.src_type != instr->cat1.dst_type)
-			return false;
-		/* If the type of dest reg and src reg are different,
-		 * it shouldn't be considered as same type mov
-		 */
-		if (!is_same_type_reg(instr->regs[0], instr->regs[1]))
-			return false;
-		break;
-	case OPC_ABSNEG_F:
-	case OPC_ABSNEG_S:
-		if (instr->flags & IR3_INSTR_SAT)
-			return false;
-		/* If the type of dest reg and src reg are different,
-		 * it shouldn't be considered as same type mov
-		 */
-		if (!is_same_type_reg(instr->regs[0], instr->regs[1]))
-			return false;
-		break;
-	default:
-		return false;
-	}
-
-	dst = instr->regs[0];
-
-	/* mov's that write to a0 or p0.x are special: */
-	if (dst->num == regid(REG_P0, 0))
-		return false;
-	if (reg_num(dst) == REG_A0)
-		return false;
-
-	if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
-		return false;
-
-	return true;
-}
-
-/* A move from const, which changes size but not type, can also be
- * folded into dest instruction in some cases.
- */
-static inline bool is_const_mov(struct ir3_instruction *instr)
-{
-	if (instr->opc != OPC_MOV)
-		return false;
-
-	if (!(instr->regs[1]->flags & IR3_REG_CONST))
-		return false;
-
-	type_t src_type = instr->cat1.src_type;
-	type_t dst_type = instr->cat1.dst_type;
-
-	return (type_float(src_type) && type_float(dst_type)) ||
-		(type_uint(src_type) && type_uint(dst_type)) ||
-		(type_sint(src_type) && type_sint(dst_type));
-}
-
-static inline bool is_alu(struct ir3_instruction *instr)
-{
-	return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
-}
-
-static inline bool is_sfu(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 4);
-}
-
-static inline bool is_tex(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 5);
-}
-
-static inline bool is_tex_or_prefetch(struct ir3_instruction *instr)
-{
-	return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
-}
-
-static inline bool is_mem(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 6);
-}
-
-static inline bool is_barrier(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 7);
-}
-
-static inline bool
-is_half(struct ir3_instruction *instr)
-{
-	return !!(instr->regs[0]->flags & IR3_REG_HALF);
-}
-
-static inline bool
-is_high(struct ir3_instruction *instr)
-{
-	return !!(instr->regs[0]->flags & IR3_REG_HIGH);
-}
-
-static inline bool
-is_store(struct ir3_instruction *instr)
-{
-	/* these instructions, the "destination" register is
-	 * actually a source, the address to store to.
-	 */
-	switch (instr->opc) {
-	case OPC_STG:
-	case OPC_STGB:
-	case OPC_STIB:
-	case OPC_STP:
-	case OPC_STL:
-	case OPC_STLW:
-	case OPC_L2G:
-	case OPC_G2L:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_load(struct ir3_instruction *instr)
-{
-	switch (instr->opc) {
-	case OPC_LDG:
-	case OPC_LDGB:
-	case OPC_LDIB:
-	case OPC_LDL:
-	case OPC_LDP:
-	case OPC_L2G:
-	case OPC_LDLW:
-	case OPC_LDC:
-	case OPC_LDLV:
-		/* probably some others too.. */
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_input(struct ir3_instruction *instr)
-{
-	/* in some cases, ldlv is used to fetch varying without
-	 * interpolation.. fortunately inloc is the first src
-	 * register in either case
-	 */
-	switch (instr->opc) {
-	case OPC_LDLV:
-	case OPC_BARY_F:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_bool(struct ir3_instruction *instr)
-{
-	switch (instr->opc) {
-	case OPC_CMPS_F:
-	case OPC_CMPS_S:
-	case OPC_CMPS_U:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_meta(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == -1);
-}
-
-static inline unsigned dest_regs(struct ir3_instruction *instr)
-{
-	if ((instr->regs_count == 0) || is_store(instr) || is_flow(instr))
-		return 0;
-
-	return util_last_bit(instr->regs[0]->wrmask);
-}
-
-static inline bool
-writes_gpr(struct ir3_instruction *instr)
-{
-	if (dest_regs(instr) == 0)
-		return false;
-	/* is dest a normal temp register: */
-	struct ir3_register *reg = instr->regs[0];
-	debug_assert(!(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)));
-	if ((reg_num(reg) == REG_A0) ||
-			(reg->num == regid(REG_P0, 0)))
-		return false;
-	return true;
-}
-
-static inline bool writes_addr0(struct ir3_instruction *instr)
-{
-	if (instr->regs_count > 0) {
-		struct ir3_register *dst = instr->regs[0];
-		return dst->num == regid(REG_A0, 0);
-	}
-	return false;
-}
-
-static inline bool writes_addr1(struct ir3_instruction *instr)
-{
-	if (instr->regs_count > 0) {
-		struct ir3_register *dst = instr->regs[0];
-		return dst->num == regid(REG_A0, 1);
-	}
-	return false;
-}
-
-static inline bool writes_pred(struct ir3_instruction *instr)
-{
-	if (instr->regs_count > 0) {
-		struct ir3_register *dst = instr->regs[0];
-		return reg_num(dst) == REG_P0;
-	}
-	return false;
-}
-
-/* returns defining instruction for reg */
-/* TODO better name */
-static inline struct ir3_instruction *ssa(struct ir3_register *reg)
-{
-	if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
-		return reg->instr;
-	}
-	return NULL;
-}
-
-static inline bool conflicts(struct ir3_instruction *a,
-		struct ir3_instruction *b)
-{
-	return (a && b) && (a != b);
-}
-
-static inline bool reg_gpr(struct ir3_register *r)
-{
-	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-		return false;
-	if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
-		return false;
-	return true;
-}
-
-static inline type_t half_type(type_t type)
-{
-	switch (type) {
-	case TYPE_F32: return TYPE_F16;
-	case TYPE_U32: return TYPE_U16;
-	case TYPE_S32: return TYPE_S16;
-	case TYPE_F16:
-	case TYPE_U16:
-	case TYPE_S16:
-		return type;
-	default:
-		assert(0);
-		return ~0;
-	}
-}
-
-/* some cat2 instructions (ie. those which are not float) can embed an
- * immediate:
- */
-static inline bool ir3_cat2_int(opc_t opc)
-{
-	switch (opc) {
-	case OPC_ADD_U:
-	case OPC_ADD_S:
-	case OPC_SUB_U:
-	case OPC_SUB_S:
-	case OPC_CMPS_U:
-	case OPC_CMPS_S:
-	case OPC_MIN_U:
-	case OPC_MIN_S:
-	case OPC_MAX_U:
-	case OPC_MAX_S:
-	case OPC_CMPV_U:
-	case OPC_CMPV_S:
-	case OPC_MUL_U24:
-	case OPC_MUL_S24:
-	case OPC_MULL_U:
-	case OPC_CLZ_S:
-	case OPC_ABSNEG_S:
-	case OPC_AND_B:
-	case OPC_OR_B:
-	case OPC_NOT_B:
-	case OPC_XOR_B:
-	case OPC_BFREV_B:
-	case OPC_CLZ_B:
-	case OPC_SHL_B:
-	case OPC_SHR_B:
-	case OPC_ASHR_B:
-	case OPC_MGEN_B:
-	case OPC_GETBIT_B:
-	case OPC_CBITS_B:
-	case OPC_BARY_F:
-		return true;
-
-	default:
-		return false;
-	}
-}
-
-/* map cat2 instruction to valid abs/neg flags: */
-static inline unsigned ir3_cat2_absneg(opc_t opc)
-{
-	switch (opc) {
-	case OPC_ADD_F:
-	case OPC_MIN_F:
-	case OPC_MAX_F:
-	case OPC_MUL_F:
-	case OPC_SIGN_F:
-	case OPC_CMPS_F:
-	case OPC_ABSNEG_F:
-	case OPC_CMPV_F:
-	case OPC_FLOOR_F:
-	case OPC_CEIL_F:
-	case OPC_RNDNE_F:
-	case OPC_RNDAZ_F:
-	case OPC_TRUNC_F:
-	case OPC_BARY_F:
-		return IR3_REG_FABS | IR3_REG_FNEG;
-
-	case OPC_ADD_U:
-	case OPC_ADD_S:
-	case OPC_SUB_U:
-	case OPC_SUB_S:
-	case OPC_CMPS_U:
-	case OPC_CMPS_S:
-	case OPC_MIN_U:
-	case OPC_MIN_S:
-	case OPC_MAX_U:
-	case OPC_MAX_S:
-	case OPC_CMPV_U:
-	case OPC_CMPV_S:
-	case OPC_MUL_U24:
-	case OPC_MUL_S24:
-	case OPC_MULL_U:
-	case OPC_CLZ_S:
-		return 0;
-
-	case OPC_ABSNEG_S:
-		return IR3_REG_SABS | IR3_REG_SNEG;
-
-	case OPC_AND_B:
-	case OPC_OR_B:
-	case OPC_NOT_B:
-	case OPC_XOR_B:
-	case OPC_BFREV_B:
-	case OPC_CLZ_B:
-	case OPC_SHL_B:
-	case OPC_SHR_B:
-	case OPC_ASHR_B:
-	case OPC_MGEN_B:
-	case OPC_GETBIT_B:
-	case OPC_CBITS_B:
-		return IR3_REG_BNOT;
-
-	default:
-		return 0;
-	}
-}
-
-/* map cat3 instructions to valid abs/neg flags: */
-static inline unsigned ir3_cat3_absneg(opc_t opc)
-{
-	switch (opc) {
-	case OPC_MAD_F16:
-	case OPC_MAD_F32:
-	case OPC_SEL_F16:
-	case OPC_SEL_F32:
-		return IR3_REG_FNEG;
-
-	case OPC_MAD_U16:
-	case OPC_MADSH_U16:
-	case OPC_MAD_S16:
-	case OPC_MADSH_M16:
-	case OPC_MAD_U24:
-	case OPC_MAD_S24:
-	case OPC_SEL_S16:
-	case OPC_SEL_S32:
-	case OPC_SAD_S16:
-	case OPC_SAD_S32:
-		/* neg *may* work on 3rd src.. */
-
-	case OPC_SEL_B16:
-	case OPC_SEL_B32:
-
-	default:
-		return 0;
-	}
-}
-
-#define MASK(n) ((1 << (n)) - 1)
-
-/* iterator for an instructions's sources (reg), also returns src #: */
-#define foreach_src_n(__srcreg, __n, __instr) \
-	if ((__instr)->regs_count) \
-		for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
-			if ((__srcreg = (__instr)->regs[__n + 1]))
-
-/* iterator for an instructions's sources (reg): */
-#define foreach_src(__srcreg, __instr) \
-	foreach_src_n(__srcreg, __i, __instr)
-
-static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
-{
-	unsigned cnt = instr->regs_count + instr->deps_count;
-	if (instr->address)
-		cnt++;
-	return cnt;
-}
-
-static inline struct ir3_instruction **
-__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
-{
-	if (n == (instr->regs_count + instr->deps_count))
-		return &instr->address;
-	if (n >= instr->regs_count)
-		return &instr->deps[n - instr->regs_count];
-	if (ssa(instr->regs[n]))
-		return &instr->regs[n]->instr;
-	return NULL;
-}
-
-static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
-{
-	if (n == (instr->regs_count + instr->deps_count))
-		return false;
-	if (n >= instr->regs_count)
-		return true;
-	return false;
-}
-
-#define foreach_ssa_srcp_n(__srcp, __n, __instr) \
-	for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL) \
-		for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
-			if ((__srcp = __ssa_srcp_n(__instr, __n)))
-
-#define foreach_ssa_srcp(__srcp, __instr) \
-	foreach_ssa_srcp_n(__srcp, __i, __instr)
-
-/* iterator for an instruction's SSA sources (instr), also returns src #: */
-#define foreach_ssa_src_n(__srcinst, __n, __instr) \
-	foreach_ssa_srcp_n(__srcp, __n, __instr) \
-		if ((__srcinst = *__srcp))
-
-/* iterator for an instruction's SSA sources (instr): */
-#define foreach_ssa_src(__srcinst, __instr) \
-	foreach_ssa_src_n(__srcinst, __i, __instr)
-
-/* iterators for shader inputs: */
-#define foreach_input_n(__ininstr, __cnt, __ir) \
-	for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \
-		if ((__ininstr = (__ir)->inputs[__cnt]))
-#define foreach_input(__ininstr, __ir) \
-	foreach_input_n(__ininstr, __i, __ir)
-
-/* iterators for shader outputs: */
-#define foreach_output_n(__outinstr, __cnt, __ir) \
-	for (unsigned __cnt = 0; __cnt < (__ir)->outputs_count; __cnt++) \
-		if ((__outinstr = (__ir)->outputs[__cnt]))
-#define foreach_output(__outinstr, __ir) \
-	foreach_output_n(__outinstr, __i, __ir)
-
-/* iterators for instructions: */
-#define foreach_instr(__instr, __list) \
-	list_for_each_entry(struct ir3_instruction, __instr, __list, node)
-#define foreach_instr_rev(__instr, __list) \
-	list_for_each_entry_rev(struct ir3_instruction, __instr, __list, node)
-#define foreach_instr_safe(__instr, __list) \
-	list_for_each_entry_safe(struct ir3_instruction, __instr, __list, node)
-
-/* iterators for blocks: */
-#define foreach_block(__block, __list) \
-	list_for_each_entry(struct ir3_block, __block, __list, node)
-#define foreach_block_safe(__block, __list) \
-	list_for_each_entry_safe(struct ir3_block, __block, __list, node)
-#define foreach_block_rev(__block, __list) \
-	list_for_each_entry_rev(struct ir3_block, __block, __list, node)
-
-/* iterators for arrays: */
-#define foreach_array(__array, __list) \
-	list_for_each_entry(struct ir3_array, __array, __list, node)
-
-/* Check if condition is true for any src instruction.
- */
-static inline bool
-check_src_cond(struct ir3_instruction *instr, bool (*cond)(struct ir3_instruction *))
-{
-	struct ir3_register *reg;
-
-	/* Note that this is also used post-RA so skip the ssa iterator: */
-	foreach_src (reg, instr) {
-		struct ir3_instruction *src = reg->instr;
-
-		if (!src)
-			continue;
-
-		/* meta:split/collect aren't real instructions, the thing that
-		 * we actually care about is *their* srcs
-		 */
-		if ((src->opc == OPC_META_SPLIT) || (src->opc == OPC_META_COLLECT)) {
-			if (check_src_cond(src, cond))
-				return true;
-		} else {
-			if (cond(src))
-				return true;
-		}
-	}
-
-	return false;
-}
-
-/* dump: */
-void ir3_print(struct ir3 *ir);
-void ir3_print_instr(struct ir3_instruction *instr);
-
-/* delay calculation: */
-int ir3_delayslots(struct ir3_instruction *assigner,
-		struct ir3_instruction *consumer, unsigned n, bool soft);
-unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-		bool soft, bool pred);
-void ir3_remove_nops(struct ir3 *ir);
-
-/* dead code elimination: */
-struct ir3_shader_variant;
-void ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
-
-/* fp16 conversion folding */
-void ir3_cf(struct ir3 *ir);
-
-/* copy-propagate: */
-void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
-
-/* group neighbors and insert mov's to resolve conflicts: */
-void ir3_group(struct ir3 *ir);
-
-/* Sethi–Ullman numbering: */
-void ir3_sun(struct ir3 *ir);
-
-/* scheduling: */
-void ir3_sched_add_deps(struct ir3 *ir);
-int ir3_sched(struct ir3 *ir);
-
-struct ir3_context;
-int ir3_postsched(struct ir3_context *ctx);
-
-bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
-
-/* register assignment: */
-struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
-int ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, unsigned nprecolor);
-
-/* legalize: */
-void ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
-
-static inline bool
-ir3_has_latency_to_hide(struct ir3 *ir)
-{
-	/* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
-	 * know the nature of the fragment shader.  Just assume it will have
-	 * latency to hide:
-	 */
-	if (ir->type != MESA_SHADER_FRAGMENT)
-		return true;
-
-	foreach_block (block, &ir->block_list) {
-		foreach_instr (instr, &block->instr_list) {
-			if (is_tex_or_prefetch(instr))
-				return true;
-
-			if (is_load(instr)) {
-				switch (instr->opc) {
-				case OPC_LDLV:
-				case OPC_LDL:
-				case OPC_LDLW:
-					break;
-				default:
-					return true;
-				}
-			}
-		}
-	}
-
-	return false;
-}
-
-/* ************************************************************************* */
-/* instruction helpers */
-
-/* creates SSA src of correct type (ie. half vs full precision) */
-static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
-		struct ir3_instruction *src, unsigned flags)
-{
-	struct ir3_register *reg;
-	if (src->regs[0]->flags & IR3_REG_HALF)
-		flags |= IR3_REG_HALF;
-	reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags);
-	reg->instr = src;
-	reg->wrmask = src->regs[0]->wrmask;
-	return reg;
-}
-
-static inline struct ir3_register * __ssa_dst(struct ir3_instruction *instr)
-{
-	struct ir3_register *reg = ir3_reg_create(instr, 0, 0);
-	reg->flags |= IR3_REG_SSA;
-	return reg;
-}
-
-static inline struct ir3_instruction *
-create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
-{
-	struct ir3_instruction *mov;
-	unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
-
-	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = type;
-	mov->cat1.dst_type = type;
-	__ssa_dst(mov)->flags |= flags;
-	ir3_reg_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
-
-	return mov;
-}
-
-static inline struct ir3_instruction *
-create_immed(struct ir3_block *block, uint32_t val)
-{
-	return create_immed_typed(block, val, TYPE_U32);
-}
-
-static inline struct ir3_instruction *
-create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
-{
-	struct ir3_instruction *mov;
-	unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
-
-	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = type;
-	mov->cat1.dst_type = type;
-	__ssa_dst(mov)->flags |= flags;
-	ir3_reg_create(mov, n, IR3_REG_CONST | flags);
-
-	return mov;
-}
-
-static inline struct ir3_instruction *
-create_uniform(struct ir3_block *block, unsigned n)
-{
-	return create_uniform_typed(block, n, TYPE_F32);
-}
-
-static inline struct ir3_instruction *
-create_uniform_indirect(struct ir3_block *block, int n,
-		struct ir3_instruction *address)
-{
-	struct ir3_instruction *mov;
-
-	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
-	__ssa_dst(mov);
-	ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
-
-	ir3_instr_set_address(mov, address);
-
-	return mov;
-}
-
-static inline struct ir3_instruction *
-ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
-{
-	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
-	__ssa_dst(instr);
-	if (src->regs[0]->flags & IR3_REG_ARRAY) {
-		struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
-		src_reg->array = src->regs[0]->array;
-	} else {
-		__ssa_src(instr, src, src->regs[0]->flags & IR3_REG_HIGH);
-	}
-	debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
-	instr->cat1.src_type = type;
-	instr->cat1.dst_type = type;
-	return instr;
-}
-
-static inline struct ir3_instruction *
-ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
-		type_t src_type, type_t dst_type)
-{
-	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
-	unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
-	unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
-
-	debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags);
-
-	__ssa_dst(instr)->flags |= dst_flags;
-	__ssa_src(instr, src, 0);
-	instr->cat1.src_type = src_type;
-	instr->cat1.dst_type = dst_type;
-	debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
-	return instr;
-}
-
-static inline struct ir3_instruction *
-ir3_NOP(struct ir3_block *block)
-{
-	return ir3_instr_create(block, OPC_NOP);
-}
-
-#define IR3_INSTR_0 0
-
-#define __INSTR0(flag, name, opc)                                        \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block)                                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, opc);                                    \
-	instr->flags |= flag;                                                \
-	return instr;                                                        \
-}
-#define INSTR0F(f, name)    __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
-#define INSTR0(name)        __INSTR0(0, name, OPC_##name)
-
-#define __INSTR1(flag, name, opc)                                        \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-		struct ir3_instruction *a, unsigned aflags)                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, opc);                                    \
-	__ssa_dst(instr);                                                    \
-	__ssa_src(instr, a, aflags);                                         \
-	instr->flags |= flag;                                                \
-	return instr;                                                        \
-}
-#define INSTR1F(f, name)    __INSTR1(IR3_INSTR_##f, name##_##f, OPC_##name)
-#define INSTR1(name)        __INSTR1(0, name, OPC_##name)
-
-#define __INSTR2(flag, name, opc)                                        \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-		struct ir3_instruction *a, unsigned aflags,                      \
-		struct ir3_instruction *b, unsigned bflags)                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, opc);                                    \
-	__ssa_dst(instr);                                                    \
-	__ssa_src(instr, a, aflags);                                         \
-	__ssa_src(instr, b, bflags);                                         \
-	instr->flags |= flag;                                                \
-	return instr;                                                        \
-}
-#define INSTR2F(f, name)    __INSTR2(IR3_INSTR_##f, name##_##f, OPC_##name)
-#define INSTR2(name)        __INSTR2(0, name, OPC_##name)
-
-#define __INSTR3(flag, name, opc)                                        \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-		struct ir3_instruction *a, unsigned aflags,                      \
-		struct ir3_instruction *b, unsigned bflags,                      \
-		struct ir3_instruction *c, unsigned cflags)                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create2(block, opc, 4);                                \
-	__ssa_dst(instr);                                                    \
-	__ssa_src(instr, a, aflags);                                         \
-	__ssa_src(instr, b, bflags);                                         \
-	__ssa_src(instr, c, cflags);                                         \
-	instr->flags |= flag;                                                \
-	return instr;                                                        \
-}
-#define INSTR3F(f, name)    __INSTR3(IR3_INSTR_##f, name##_##f, OPC_##name)
-#define INSTR3(name)        __INSTR3(0, name, OPC_##name)
-
-#define __INSTR4(flag, name, opc)                                        \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-		struct ir3_instruction *a, unsigned aflags,                      \
-		struct ir3_instruction *b, unsigned bflags,                      \
-		struct ir3_instruction *c, unsigned cflags,                      \
-		struct ir3_instruction *d, unsigned dflags)                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create2(block, opc, 5);                                \
-	__ssa_dst(instr);                                                    \
-	__ssa_src(instr, a, aflags);                                         \
-	__ssa_src(instr, b, bflags);                                         \
-	__ssa_src(instr, c, cflags);                                         \
-	__ssa_src(instr, d, dflags);                                         \
-	instr->flags |= flag;                                                \
-	return instr;                                                        \
-}
-#define INSTR4F(f, name)    __INSTR4(IR3_INSTR_##f, name##_##f, OPC_##name)
-#define INSTR4(name)        __INSTR4(0, name, OPC_##name)
-
-/* cat0 instructions: */
-INSTR1(B)
-INSTR0(JUMP)
-INSTR1(KILL)
-INSTR0(END)
-INSTR0(CHSH)
-INSTR0(CHMASK)
-INSTR1(PREDT)
-INSTR0(PREDF)
-INSTR0(PREDE)
-
-/* cat2 instructions, most 2 src but some 1 src: */
-INSTR2(ADD_F)
-INSTR2(MIN_F)
-INSTR2(MAX_F)
-INSTR2(MUL_F)
-INSTR1(SIGN_F)
-INSTR2(CMPS_F)
-INSTR1(ABSNEG_F)
-INSTR2(CMPV_F)
-INSTR1(FLOOR_F)
-INSTR1(CEIL_F)
-INSTR1(RNDNE_F)
-INSTR1(RNDAZ_F)
-INSTR1(TRUNC_F)
-INSTR2(ADD_U)
-INSTR2(ADD_S)
-INSTR2(SUB_U)
-INSTR2(SUB_S)
-INSTR2(CMPS_U)
-INSTR2(CMPS_S)
-INSTR2(MIN_U)
-INSTR2(MIN_S)
-INSTR2(MAX_U)
-INSTR2(MAX_S)
-INSTR1(ABSNEG_S)
-INSTR2(AND_B)
-INSTR2(OR_B)
-INSTR1(NOT_B)
-INSTR2(XOR_B)
-INSTR2(CMPV_U)
-INSTR2(CMPV_S)
-INSTR2(MUL_U24)
-INSTR2(MUL_S24)
-INSTR2(MULL_U)
-INSTR1(BFREV_B)
-INSTR1(CLZ_S)
-INSTR1(CLZ_B)
-INSTR2(SHL_B)
-INSTR2(SHR_B)
-INSTR2(ASHR_B)
-INSTR2(BARY_F)
-INSTR2(MGEN_B)
-INSTR2(GETBIT_B)
-INSTR1(SETRM)
-INSTR1(CBITS_B)
-INSTR2(SHB)
-INSTR2(MSAD)
-
-/* cat3 instructions: */
-INSTR3(MAD_U16)
-INSTR3(MADSH_U16)
-INSTR3(MAD_S16)
-INSTR3(MADSH_M16)
-INSTR3(MAD_U24)
-INSTR3(MAD_S24)
-INSTR3(MAD_F16)
-INSTR3(MAD_F32)
-/* NOTE: SEL_B32 checks for zero vs nonzero */
-INSTR3(SEL_B16)
-INSTR3(SEL_B32)
-INSTR3(SEL_S16)
-INSTR3(SEL_S32)
-INSTR3(SEL_F16)
-INSTR3(SEL_F32)
-INSTR3(SAD_S16)
-INSTR3(SAD_S32)
-
-/* cat4 instructions: */
-INSTR1(RCP)
-INSTR1(RSQ)
-INSTR1(HRSQ)
-INSTR1(LOG2)
-INSTR1(HLOG2)
-INSTR1(EXP2)
-INSTR1(HEXP2)
-INSTR1(SIN)
-INSTR1(COS)
-INSTR1(SQRT)
-
-/* cat5 instructions: */
-INSTR1(DSX)
-INSTR1(DSXPP_1)
-INSTR1(DSY)
-INSTR1(DSYPP_1)
-INSTR1F(3D, DSX)
-INSTR1F(3D, DSY)
-INSTR1(RGETPOS)
-
-static inline struct ir3_instruction *
-ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
-		unsigned wrmask, unsigned flags, struct ir3_instruction *samp_tex,
-		struct ir3_instruction *src0, struct ir3_instruction *src1)
-{
-	struct ir3_instruction *sam;
-
-	sam = ir3_instr_create(block, opc);
-	sam->flags |= flags;
-	__ssa_dst(sam)->wrmask = wrmask;
-	if (flags & IR3_INSTR_S2EN) {
-		__ssa_src(sam, samp_tex, IR3_REG_HALF);
-	}
-	if (src0) {
-		__ssa_src(sam, src0, 0);
-	}
-	if (src1) {
-		__ssa_src(sam, src1, 0);
-	}
-	sam->cat5.type  = type;
-
-	return sam;
-}
-
-/* cat6 instructions: */
-INSTR2(LDLV)
-INSTR3(LDG)
-INSTR3(LDL)
-INSTR3(LDLW)
-INSTR3(STG)
-INSTR3(STL)
-INSTR3(STLW)
-INSTR1(RESINFO)
-INSTR1(RESFMT)
-INSTR2(ATOMIC_ADD)
-INSTR2(ATOMIC_SUB)
-INSTR2(ATOMIC_XCHG)
-INSTR2(ATOMIC_INC)
-INSTR2(ATOMIC_DEC)
-INSTR2(ATOMIC_CMPXCHG)
-INSTR2(ATOMIC_MIN)
-INSTR2(ATOMIC_MAX)
-INSTR2(ATOMIC_AND)
-INSTR2(ATOMIC_OR)
-INSTR2(ATOMIC_XOR)
-INSTR2(LDC)
-#if GPU >= 600
-INSTR3(STIB);
-INSTR2(LDIB);
-INSTR3F(G, ATOMIC_ADD)
-INSTR3F(G, ATOMIC_SUB)
-INSTR3F(G, ATOMIC_XCHG)
-INSTR3F(G, ATOMIC_INC)
-INSTR3F(G, ATOMIC_DEC)
-INSTR3F(G, ATOMIC_CMPXCHG)
-INSTR3F(G, ATOMIC_MIN)
-INSTR3F(G, ATOMIC_MAX)
-INSTR3F(G, ATOMIC_AND)
-INSTR3F(G, ATOMIC_OR)
-INSTR3F(G, ATOMIC_XOR)
-#elif GPU >= 400
-INSTR3(LDGB)
-INSTR4(STGB)
-INSTR4(STIB)
-INSTR4F(G, ATOMIC_ADD)
-INSTR4F(G, ATOMIC_SUB)
-INSTR4F(G, ATOMIC_XCHG)
-INSTR4F(G, ATOMIC_INC)
-INSTR4F(G, ATOMIC_DEC)
-INSTR4F(G, ATOMIC_CMPXCHG)
-INSTR4F(G, ATOMIC_MIN)
-INSTR4F(G, ATOMIC_MAX)
-INSTR4F(G, ATOMIC_AND)
-INSTR4F(G, ATOMIC_OR)
-INSTR4F(G, ATOMIC_XOR)
-#endif
-
-INSTR4F(G, STG)
-
-/* cat7 instructions: */
-INSTR0(BAR)
-INSTR0(FENCE)
-
-/* meta instructions: */
-INSTR0(META_TEX_PREFETCH);
-
-/* ************************************************************************* */
-/* split this out or find some helper to use.. like main/bitset.h.. */
-
-#include <string.h>
-#include "util/bitset.h"
-
-#define MAX_REG 256
-
-typedef BITSET_DECLARE(regmask_t, 2 * MAX_REG);
-
-static inline bool
-__regmask_get(regmask_t *regmask, struct ir3_register *reg, unsigned n)
-{
-	if (reg->merged) {
-		/* a6xx+ case, with merged register file, we track things in terms
-		 * of half-precision registers, with a full precisions register
-		 * using two half-precision slots:
-		 */
-		if (reg->flags & IR3_REG_HALF) {
-			return BITSET_TEST(*regmask, n);
-		} else {
-			n *= 2;
-			return BITSET_TEST(*regmask, n) || BITSET_TEST(*regmask, n+1);
-		}
-	} else {
-		/* pre a6xx case, with separate register file for half and full
-		 * precision:
-		 */
-		if (reg->flags & IR3_REG_HALF)
-			n += MAX_REG;
-		return BITSET_TEST(*regmask, n);
-	}
-}
-
-static inline void
-__regmask_set(regmask_t *regmask, struct ir3_register *reg, unsigned n)
-{
-	if (reg->merged) {
-		/* a6xx+ case, with merged register file, we track things in terms
-		 * of half-precision registers, with a full precisions register
-		 * using two half-precision slots:
-		 */
-		if (reg->flags & IR3_REG_HALF) {
-			BITSET_SET(*regmask, n);
-		} else {
-			n *= 2;
-			BITSET_SET(*regmask, n);
-			BITSET_SET(*regmask, n+1);
-		}
-	} else {
-		/* pre a6xx case, with separate register file for half and full
-		 * precision:
-		 */
-		if (reg->flags & IR3_REG_HALF)
-			n += MAX_REG;
-		BITSET_SET(*regmask, n);
-	}
-}
-
-static inline void regmask_init(regmask_t *regmask)
-{
-	memset(regmask, 0, sizeof(*regmask));
-}
-
-static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
-{
-	if (reg->flags & IR3_REG_RELATIV) {
-		for (unsigned i = 0; i < reg->size; i++)
-			__regmask_set(regmask, reg, reg->array.offset + i);
-	} else {
-		for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
-			if (mask & 1)
-				__regmask_set(regmask, reg, n);
-	}
-}
-
-static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
-{
-	unsigned i;
-	for (i = 0; i < ARRAY_SIZE(*dst); i++)
-		(*dst)[i] = (*a)[i] | (*b)[i];
-}
-
-static inline bool regmask_get(regmask_t *regmask,
-		struct ir3_register *reg)
-{
-	if (reg->flags & IR3_REG_RELATIV) {
-		for (unsigned i = 0; i < reg->size; i++)
-			if (__regmask_get(regmask, reg, reg->array.offset + i))
-				return true;
-	} else {
-		for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
-			if (mask & 1)
-				if (__regmask_get(regmask, reg, n))
-					return true;
-	}
-	return false;
-}
-
-/* ************************************************************************* */
-
-#endif /* IR3_H_ */
diff --git a/extra/disassemblers/adreno/shader_enums.h b/extra/disassemblers/adreno/shader_enums.h
deleted file mode 100644
index b33a91727a..0000000000
--- a/extra/disassemblers/adreno/shader_enums.h
+++ /dev/null
@@ -1,906 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
- * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef SHADER_ENUMS_H
-#define SHADER_ENUMS_H
-
-#include <stdbool.h>
-
-/* Project-wide (GL and Vulkan) maximum. */
-#define MAX_DRAW_BUFFERS 8
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Shader stages.
- *
- * The order must match how shaders are ordered in the pipeline.
- * The GLSL linker assumes that if i<j, then the j-th shader is
- * executed later than the i-th shader.
- */
-typedef enum
-{
-   MESA_SHADER_NONE = -1,
-   MESA_SHADER_VERTEX = 0,
-   MESA_SHADER_TESS_CTRL = 1,
-   MESA_SHADER_TESS_EVAL = 2,
-   MESA_SHADER_GEOMETRY = 3,
-   MESA_SHADER_FRAGMENT = 4,
-   MESA_SHADER_COMPUTE = 5,
-   /* must be last so it doesn't affect the GL pipeline */
-   MESA_SHADER_KERNEL = 6,
-} gl_shader_stage;
-
-static inline bool
-gl_shader_stage_is_compute(gl_shader_stage stage)
-{
-   return stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL;
-}
-
-/**
- * Number of STATE_* values we need to address any GL state.
- * Used to dimension arrays.
- */
-#define STATE_LENGTH 5
-
-typedef short gl_state_index16; /* see enum gl_state_index */
-
-const char *gl_shader_stage_name(gl_shader_stage stage);
-
-/**
- * Translate a gl_shader_stage to a short shader stage name for debug
- * printouts and error messages.
- */
-const char *_mesa_shader_stage_to_string(unsigned stage);
-
-/**
- * Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
- * for debug printouts and error messages.
- */
-const char *_mesa_shader_stage_to_abbrev(unsigned stage);
-
-/**
- * GL related stages (not including CL)
- */
-#define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
-
-/**
- * All stages
- */
-#define MESA_ALL_SHADER_STAGES (MESA_SHADER_KERNEL + 1)
-
-
-/**
- * Indexes for vertex program attributes.
- * GL_NV_vertex_program aliases generic attributes over the conventional
- * attributes.  In GL_ARB_vertex_program shader the aliasing is optional.
- * In GL_ARB_vertex_shader / OpenGL 2.0 the aliasing is disallowed (the
- * generic attributes are distinct/separate).
- */
-typedef enum
-{
-   VERT_ATTRIB_POS,
-   VERT_ATTRIB_NORMAL,
-   VERT_ATTRIB_COLOR0,
-   VERT_ATTRIB_COLOR1,
-   VERT_ATTRIB_FOG,
-   VERT_ATTRIB_COLOR_INDEX,
-   VERT_ATTRIB_EDGEFLAG,
-   VERT_ATTRIB_TEX0,
-   VERT_ATTRIB_TEX1,
-   VERT_ATTRIB_TEX2,
-   VERT_ATTRIB_TEX3,
-   VERT_ATTRIB_TEX4,
-   VERT_ATTRIB_TEX5,
-   VERT_ATTRIB_TEX6,
-   VERT_ATTRIB_TEX7,
-   VERT_ATTRIB_POINT_SIZE,
-   VERT_ATTRIB_GENERIC0,
-   VERT_ATTRIB_GENERIC1,
-   VERT_ATTRIB_GENERIC2,
-   VERT_ATTRIB_GENERIC3,
-   VERT_ATTRIB_GENERIC4,
-   VERT_ATTRIB_GENERIC5,
-   VERT_ATTRIB_GENERIC6,
-   VERT_ATTRIB_GENERIC7,
-   VERT_ATTRIB_GENERIC8,
-   VERT_ATTRIB_GENERIC9,
-   VERT_ATTRIB_GENERIC10,
-   VERT_ATTRIB_GENERIC11,
-   VERT_ATTRIB_GENERIC12,
-   VERT_ATTRIB_GENERIC13,
-   VERT_ATTRIB_GENERIC14,
-   VERT_ATTRIB_GENERIC15,
-   VERT_ATTRIB_MAX
-} gl_vert_attrib;
-
-const char *gl_vert_attrib_name(gl_vert_attrib attrib);
-
-/**
- * Symbolic constats to help iterating over
- * specific blocks of vertex attributes.
- *
- * VERT_ATTRIB_FF
- *   includes all fixed function attributes as well as
- *   the aliased GL_NV_vertex_program shader attributes.
- * VERT_ATTRIB_TEX
- *   include the classic texture coordinate attributes.
- *   Is a subset of VERT_ATTRIB_FF.
- * VERT_ATTRIB_GENERIC
- *   include the OpenGL 2.0+ GLSL generic shader attributes.
- *   These alias the generic GL_ARB_vertex_shader attributes.
- * VERT_ATTRIB_MAT
- *   include the generic shader attributes used to alias
- *   varying material values for the TNL shader programs.
- *   They are located at the end of the generic attribute
- *   block not to overlap with the generic 0 attribute.
- */
-#define VERT_ATTRIB_FF(i)           (VERT_ATTRIB_POS + (i))
-#define VERT_ATTRIB_FF_MAX          VERT_ATTRIB_GENERIC0
-
-#define VERT_ATTRIB_TEX(i)          (VERT_ATTRIB_TEX0 + (i))
-#define VERT_ATTRIB_TEX_MAX         MAX_TEXTURE_COORD_UNITS
-
-#define VERT_ATTRIB_GENERIC(i)      (VERT_ATTRIB_GENERIC0 + (i))
-#define VERT_ATTRIB_GENERIC_MAX     MAX_VERTEX_GENERIC_ATTRIBS
-
-#define VERT_ATTRIB_MAT0            \
-   (VERT_ATTRIB_GENERIC_MAX - VERT_ATTRIB_MAT_MAX)
-#define VERT_ATTRIB_MAT(i)          \
-   VERT_ATTRIB_GENERIC((i) + VERT_ATTRIB_MAT0)
-#define VERT_ATTRIB_MAT_MAX         MAT_ATTRIB_MAX
-
-/**
- * Bitflags for vertex attributes.
- * These are used in bitfields in many places.
- */
-/*@{*/
-#define VERT_BIT_POS             BITFIELD_BIT(VERT_ATTRIB_POS)
-#define VERT_BIT_NORMAL          BITFIELD_BIT(VERT_ATTRIB_NORMAL)
-#define VERT_BIT_COLOR0          BITFIELD_BIT(VERT_ATTRIB_COLOR0)
-#define VERT_BIT_COLOR1          BITFIELD_BIT(VERT_ATTRIB_COLOR1)
-#define VERT_BIT_FOG             BITFIELD_BIT(VERT_ATTRIB_FOG)
-#define VERT_BIT_COLOR_INDEX     BITFIELD_BIT(VERT_ATTRIB_COLOR_INDEX)
-#define VERT_BIT_EDGEFLAG        BITFIELD_BIT(VERT_ATTRIB_EDGEFLAG)
-#define VERT_BIT_TEX0            BITFIELD_BIT(VERT_ATTRIB_TEX0)
-#define VERT_BIT_TEX1            BITFIELD_BIT(VERT_ATTRIB_TEX1)
-#define VERT_BIT_TEX2            BITFIELD_BIT(VERT_ATTRIB_TEX2)
-#define VERT_BIT_TEX3            BITFIELD_BIT(VERT_ATTRIB_TEX3)
-#define VERT_BIT_TEX4            BITFIELD_BIT(VERT_ATTRIB_TEX4)
-#define VERT_BIT_TEX5            BITFIELD_BIT(VERT_ATTRIB_TEX5)
-#define VERT_BIT_TEX6            BITFIELD_BIT(VERT_ATTRIB_TEX6)
-#define VERT_BIT_TEX7            BITFIELD_BIT(VERT_ATTRIB_TEX7)
-#define VERT_BIT_POINT_SIZE      BITFIELD_BIT(VERT_ATTRIB_POINT_SIZE)
-#define VERT_BIT_GENERIC0        BITFIELD_BIT(VERT_ATTRIB_GENERIC0)
-
-#define VERT_BIT(i)              BITFIELD_BIT(i)
-#define VERT_BIT_ALL             BITFIELD_RANGE(0, VERT_ATTRIB_MAX)
-
-#define VERT_BIT_FF(i)           VERT_BIT(i)
-#define VERT_BIT_FF_ALL          BITFIELD_RANGE(0, VERT_ATTRIB_FF_MAX)
-#define VERT_BIT_TEX(i)          VERT_BIT(VERT_ATTRIB_TEX(i))
-#define VERT_BIT_TEX_ALL         \
-   BITFIELD_RANGE(VERT_ATTRIB_TEX(0), VERT_ATTRIB_TEX_MAX)
-
-#define VERT_BIT_GENERIC(i)      VERT_BIT(VERT_ATTRIB_GENERIC(i))
-#define VERT_BIT_GENERIC_ALL     \
-   BITFIELD_RANGE(VERT_ATTRIB_GENERIC(0), VERT_ATTRIB_GENERIC_MAX)
-
-#define VERT_BIT_MAT(i)	         VERT_BIT(VERT_ATTRIB_MAT(i))
-#define VERT_BIT_MAT_ALL         \
-   BITFIELD_RANGE(VERT_ATTRIB_MAT(0), VERT_ATTRIB_MAT_MAX)
-/*@}*/
-
-#define MAX_VARYING 32 /**< number of float[4] vectors */
-
-/**
- * Indexes for vertex shader outputs, geometry shader inputs/outputs, and
- * fragment shader inputs.
- *
- * Note that some of these values are not available to all pipeline stages.
- *
- * When this enum is updated, the following code must be updated too:
- * - vertResults (in prog_print.c's arb_output_attrib_string())
- * - fragAttribs (in prog_print.c's arb_input_attrib_string())
- * - _mesa_varying_slot_in_fs()
- */
-typedef enum
-{
-   VARYING_SLOT_POS,
-   VARYING_SLOT_COL0, /* COL0 and COL1 must be contiguous */
-   VARYING_SLOT_COL1,
-   VARYING_SLOT_FOGC,
-   VARYING_SLOT_TEX0, /* TEX0-TEX7 must be contiguous */
-   VARYING_SLOT_TEX1,
-   VARYING_SLOT_TEX2,
-   VARYING_SLOT_TEX3,
-   VARYING_SLOT_TEX4,
-   VARYING_SLOT_TEX5,
-   VARYING_SLOT_TEX6,
-   VARYING_SLOT_TEX7,
-   VARYING_SLOT_PSIZ, /* Does not appear in FS */
-   VARYING_SLOT_BFC0, /* Does not appear in FS */
-   VARYING_SLOT_BFC1, /* Does not appear in FS */
-   VARYING_SLOT_EDGE, /* Does not appear in FS */
-   VARYING_SLOT_CLIP_VERTEX, /* Does not appear in FS */
-   VARYING_SLOT_CLIP_DIST0,
-   VARYING_SLOT_CLIP_DIST1,
-   VARYING_SLOT_CULL_DIST0,
-   VARYING_SLOT_CULL_DIST1,
-   VARYING_SLOT_PRIMITIVE_ID, /* Does not appear in VS */
-   VARYING_SLOT_LAYER, /* Appears as VS or GS output */
-   VARYING_SLOT_VIEWPORT, /* Appears as VS or GS output */
-   VARYING_SLOT_FACE, /* FS only */
-   VARYING_SLOT_PNTC, /* FS only */
-   VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
-   VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
-   VARYING_SLOT_BOUNDING_BOX0, /* Only appears as TCS output. */
-   VARYING_SLOT_BOUNDING_BOX1, /* Only appears as TCS output. */
-   VARYING_SLOT_VIEW_INDEX,
-   VARYING_SLOT_VIEWPORT_MASK, /* Does not appear in FS */
-   VARYING_SLOT_VAR0, /* First generic varying slot */
-   /* the remaining are simply for the benefit of gl_varying_slot_name()
-    * and not to be construed as an upper bound:
-    */
-   VARYING_SLOT_VAR1,
-   VARYING_SLOT_VAR2,
-   VARYING_SLOT_VAR3,
-   VARYING_SLOT_VAR4,
-   VARYING_SLOT_VAR5,
-   VARYING_SLOT_VAR6,
-   VARYING_SLOT_VAR7,
-   VARYING_SLOT_VAR8,
-   VARYING_SLOT_VAR9,
-   VARYING_SLOT_VAR10,
-   VARYING_SLOT_VAR11,
-   VARYING_SLOT_VAR12,
-   VARYING_SLOT_VAR13,
-   VARYING_SLOT_VAR14,
-   VARYING_SLOT_VAR15,
-   VARYING_SLOT_VAR16,
-   VARYING_SLOT_VAR17,
-   VARYING_SLOT_VAR18,
-   VARYING_SLOT_VAR19,
-   VARYING_SLOT_VAR20,
-   VARYING_SLOT_VAR21,
-   VARYING_SLOT_VAR22,
-   VARYING_SLOT_VAR23,
-   VARYING_SLOT_VAR24,
-   VARYING_SLOT_VAR25,
-   VARYING_SLOT_VAR26,
-   VARYING_SLOT_VAR27,
-   VARYING_SLOT_VAR28,
-   VARYING_SLOT_VAR29,
-   VARYING_SLOT_VAR30,
-   VARYING_SLOT_VAR31,
-} gl_varying_slot;
-
-
-#define VARYING_SLOT_MAX	(VARYING_SLOT_VAR0 + MAX_VARYING)
-#define VARYING_SLOT_PATCH0	(VARYING_SLOT_MAX)
-#define VARYING_SLOT_TESS_MAX	(VARYING_SLOT_PATCH0 + MAX_VARYING)
-#define MAX_VARYINGS_INCL_PATCH (VARYING_SLOT_TESS_MAX - VARYING_SLOT_VAR0)
-
-const char *gl_varying_slot_name(gl_varying_slot slot);
-
-/**
- * Bitflags for varying slots.
- */
-/*@{*/
-#define VARYING_BIT_POS BITFIELD64_BIT(VARYING_SLOT_POS)
-#define VARYING_BIT_COL0 BITFIELD64_BIT(VARYING_SLOT_COL0)
-#define VARYING_BIT_COL1 BITFIELD64_BIT(VARYING_SLOT_COL1)
-#define VARYING_BIT_FOGC BITFIELD64_BIT(VARYING_SLOT_FOGC)
-#define VARYING_BIT_TEX0 BITFIELD64_BIT(VARYING_SLOT_TEX0)
-#define VARYING_BIT_TEX1 BITFIELD64_BIT(VARYING_SLOT_TEX1)
-#define VARYING_BIT_TEX2 BITFIELD64_BIT(VARYING_SLOT_TEX2)
-#define VARYING_BIT_TEX3 BITFIELD64_BIT(VARYING_SLOT_TEX3)
-#define VARYING_BIT_TEX4 BITFIELD64_BIT(VARYING_SLOT_TEX4)
-#define VARYING_BIT_TEX5 BITFIELD64_BIT(VARYING_SLOT_TEX5)
-#define VARYING_BIT_TEX6 BITFIELD64_BIT(VARYING_SLOT_TEX6)
-#define VARYING_BIT_TEX7 BITFIELD64_BIT(VARYING_SLOT_TEX7)
-#define VARYING_BIT_TEX(U) BITFIELD64_BIT(VARYING_SLOT_TEX0 + (U))
-#define VARYING_BITS_TEX_ANY BITFIELD64_RANGE(VARYING_SLOT_TEX0, \
-                                              MAX_TEXTURE_COORD_UNITS)
-#define VARYING_BIT_PSIZ BITFIELD64_BIT(VARYING_SLOT_PSIZ)
-#define VARYING_BIT_BFC0 BITFIELD64_BIT(VARYING_SLOT_BFC0)
-#define VARYING_BIT_BFC1 BITFIELD64_BIT(VARYING_SLOT_BFC1)
-#define VARYING_BITS_COLOR (VARYING_BIT_COL0 | \
-                            VARYING_BIT_COL1 |        \
-                            VARYING_BIT_BFC0 |        \
-                            VARYING_BIT_BFC1)
-#define VARYING_BIT_EDGE BITFIELD64_BIT(VARYING_SLOT_EDGE)
-#define VARYING_BIT_CLIP_VERTEX BITFIELD64_BIT(VARYING_SLOT_CLIP_VERTEX)
-#define VARYING_BIT_CLIP_DIST0 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)
-#define VARYING_BIT_CLIP_DIST1 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)
-#define VARYING_BIT_CULL_DIST0 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)
-#define VARYING_BIT_CULL_DIST1 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)
-#define VARYING_BIT_PRIMITIVE_ID BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID)
-#define VARYING_BIT_LAYER BITFIELD64_BIT(VARYING_SLOT_LAYER)
-#define VARYING_BIT_VIEWPORT BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)
-#define VARYING_BIT_FACE BITFIELD64_BIT(VARYING_SLOT_FACE)
-#define VARYING_BIT_PNTC BITFIELD64_BIT(VARYING_SLOT_PNTC)
-#define VARYING_BIT_TESS_LEVEL_OUTER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER)
-#define VARYING_BIT_TESS_LEVEL_INNER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_INNER)
-#define VARYING_BIT_BOUNDING_BOX0 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX0)
-#define VARYING_BIT_BOUNDING_BOX1 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX1)
-#define VARYING_BIT_VIEWPORT_MASK BITFIELD64_BIT(VARYING_SLOT_VIEWPORT_MASK)
-#define VARYING_BIT_VAR(V) BITFIELD64_BIT(VARYING_SLOT_VAR0 + (V))
-/*@}*/
-
-/**
- * Bitflags for system values.
- */
-#define SYSTEM_BIT_SAMPLE_ID ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_ID)
-#define SYSTEM_BIT_SAMPLE_POS ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_POS)
-#define SYSTEM_BIT_SAMPLE_MASK_IN ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_MASK_IN)
-#define SYSTEM_BIT_LOCAL_INVOCATION_ID ((uint64_t)1 << SYSTEM_VALUE_LOCAL_INVOCATION_ID)
-
-/**
- * If the gl_register_file is PROGRAM_SYSTEM_VALUE, the register index will be
- * one of these values.  If a NIR variable's mode is nir_var_system_value, it
- * will be one of these values.
- */
-typedef enum
-{
-   /**
-    * \name System values applicable to all shaders
-    */
-   /*@{*/
-
-   /**
-    * Builtin variables added by GL_ARB_shader_ballot.
-    */
-   /*@{*/
-
-   /**
-    * From the GL_ARB_shader-ballot spec:
-    *
-    *    "A sub-group is a collection of invocations which execute in lockstep.
-    *     The variable <gl_SubGroupSizeARB> is the maximum number of
-    *     invocations in a sub-group. The maximum <gl_SubGroupSizeARB>
-    *     supported in this extension is 64."
-    *
-    * The spec defines this as a uniform. However, it's highly unlikely that
-    * implementations actually treat it as a uniform (which is loaded from a
-    * constant buffer). Most likely, this is an implementation-wide constant,
-    * or perhaps something that depends on the shader stage.
-    */
-   SYSTEM_VALUE_SUBGROUP_SIZE,
-
-   /**
-    * From the GL_ARB_shader_ballot spec:
-    *
-    *    "The variable <gl_SubGroupInvocationARB> holds the index of the
-    *     invocation within sub-group. This variable is in the range 0 to
-    *     <gl_SubGroupSizeARB>-1, where <gl_SubGroupSizeARB> is the total
-    *     number of invocations in a sub-group."
-    */
-   SYSTEM_VALUE_SUBGROUP_INVOCATION,
-
-   /**
-    * From the GL_ARB_shader_ballot spec:
-    *
-    *    "The <gl_SubGroup??MaskARB> variables provide a bitmask for all
-    *     invocations, with one bit per invocation starting with the least
-    *     significant bit, according to the following table,
-    *
-    *       variable               equation for bit values
-    *       --------------------   ------------------------------------
-    *       gl_SubGroupEqMaskARB   bit index == gl_SubGroupInvocationARB
-    *       gl_SubGroupGeMaskARB   bit index >= gl_SubGroupInvocationARB
-    *       gl_SubGroupGtMaskARB   bit index >  gl_SubGroupInvocationARB
-    *       gl_SubGroupLeMaskARB   bit index <= gl_SubGroupInvocationARB
-    *       gl_SubGroupLtMaskARB   bit index <  gl_SubGroupInvocationARB
-    */
-   SYSTEM_VALUE_SUBGROUP_EQ_MASK,
-   SYSTEM_VALUE_SUBGROUP_GE_MASK,
-   SYSTEM_VALUE_SUBGROUP_GT_MASK,
-   SYSTEM_VALUE_SUBGROUP_LE_MASK,
-   SYSTEM_VALUE_SUBGROUP_LT_MASK,
-   /*@}*/
-
-   /**
-    * Builtin variables added by VK_KHR_subgroups
-    */
-   /*@{*/
-   SYSTEM_VALUE_NUM_SUBGROUPS,
-   SYSTEM_VALUE_SUBGROUP_ID,
-   /*@}*/
-
-   /*@}*/
-
-   /**
-    * \name Vertex shader system values
-    */
-   /*@{*/
-   /**
-    * OpenGL-style vertex ID.
-    *
-    * Section 2.11.7 (Shader Execution), subsection Shader Inputs, of the
-    * OpenGL 3.3 core profile spec says:
-    *
-    *     "gl_VertexID holds the integer index i implicitly passed by
-    *     DrawArrays or one of the other drawing commands defined in section
-    *     2.8.3."
-    *
-    * Section 2.8.3 (Drawing Commands) of the same spec says:
-    *
-    *     "The commands....are equivalent to the commands with the same base
-    *     name (without the BaseVertex suffix), except that the ith element
-    *     transferred by the corresponding draw call will be taken from
-    *     element indices[i] + basevertex of each enabled array."
-    *
-    * Additionally, the overview in the GL_ARB_shader_draw_parameters spec
-    * says:
-    *
-    *     "In unextended GL, vertex shaders have inputs named gl_VertexID and
-    *     gl_InstanceID, which contain, respectively the index of the vertex
-    *     and instance. The value of gl_VertexID is the implicitly passed
-    *     index of the vertex being processed, which includes the value of
-    *     baseVertex, for those commands that accept it."
-    *
-    * gl_VertexID gets basevertex added in.  This differs from DirectX where
-    * SV_VertexID does \b not get basevertex added in.
-    *
-    * \note
-    * If all system values are available, \c SYSTEM_VALUE_VERTEX_ID will be
-    * equal to \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus
-    * \c SYSTEM_VALUE_BASE_VERTEX.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_BASE_VERTEX
-    */
-   SYSTEM_VALUE_VERTEX_ID,
-
-   /**
-    * Instanced ID as supplied to gl_InstanceID
-    *
-    * Values assigned to gl_InstanceID always begin with zero, regardless of
-    * the value of baseinstance.
-    *
-    * Section 11.1.3.9 (Shader Inputs) of the OpenGL 4.4 core profile spec
-    * says:
-    *
-    *     "gl_InstanceID holds the integer instance number of the current
-    *     primitive in an instanced draw call (see section 10.5)."
-    *
-    * Through a big chain of pseudocode, section 10.5 describes that
-    * baseinstance is not counted by gl_InstanceID.  In that section, notice
-    *
-    *     "If an enabled vertex attribute array is instanced (it has a
-    *     non-zero divisor as specified by VertexAttribDivisor), the element
-    *     index that is transferred to the GL, for all vertices, is given by
-    *
-    *         floor(instance/divisor) + baseinstance
-    *
-    *     If an array corresponding to an attribute required by a vertex
-    *     shader is not enabled, then the corresponding element is taken from
-    *     the current attribute state (see section 10.2)."
-    *
-    * Note that baseinstance is \b not included in the value of instance.
-    */
-   SYSTEM_VALUE_INSTANCE_ID,
-
-   /**
-    * Vulkan InstanceIndex.
-    *
-    * InstanceIndex = gl_InstanceID + gl_BaseInstance
-    */
-   SYSTEM_VALUE_INSTANCE_INDEX,
-
-   /**
-    * DirectX-style vertex ID.
-    *
-    * Unlike \c SYSTEM_VALUE_VERTEX_ID, this system value does \b not include
-    * the value of basevertex.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_BASE_VERTEX
-    */
-   SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
-
-   /**
-    * Value of \c basevertex passed to \c glDrawElementsBaseVertex and similar
-    * functions.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
-    */
-   SYSTEM_VALUE_BASE_VERTEX,
-
-   /**
-    * Depending on the type of the draw call (indexed or non-indexed),
-    * is the value of \c basevertex passed to \c glDrawElementsBaseVertex and
-    * similar, or is the value of \c first passed to \c glDrawArrays and
-    * similar.
-    *
-    * \note
-    * It can be used to calculate the \c SYSTEM_VALUE_VERTEX_ID as
-    * \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus \c SYSTEM_VALUE_FIRST_VERTEX.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_VERTEX_ID
-    */
-   SYSTEM_VALUE_FIRST_VERTEX,
-
-   /**
-    * If the Draw command used to start the rendering was an indexed draw
-    * or not (~0/0). Useful to calculate \c SYSTEM_VALUE_BASE_VERTEX as
-    * \c SYSTEM_VALUE_IS_INDEXED_DRAW & \c SYSTEM_VALUE_FIRST_VERTEX.
-    */
-   SYSTEM_VALUE_IS_INDEXED_DRAW,
-
-   /**
-    * Value of \c baseinstance passed to instanced draw entry points
-    *
-    * \sa SYSTEM_VALUE_INSTANCE_ID
-    */
-   SYSTEM_VALUE_BASE_INSTANCE,
-
-   /**
-    * From _ARB_shader_draw_parameters:
-    *
-    *   "Additionally, this extension adds a further built-in variable,
-    *    gl_DrawID to the shading language. This variable contains the index
-    *    of the draw currently being processed by a Multi* variant of a
-    *    drawing command (such as MultiDrawElements or
-    *    MultiDrawArraysIndirect)."
-    *
-    * If GL_ARB_multi_draw_indirect is not supported, this is always 0.
-    */
-   SYSTEM_VALUE_DRAW_ID,
-   /*@}*/
-
-   /**
-    * \name Geometry shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_INVOCATION_ID,  /**< (Also in Tessellation Control shader) */
-   /*@}*/
-
-   /**
-    * \name Fragment shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_FRAG_COORD,
-   SYSTEM_VALUE_POINT_COORD,
-   SYSTEM_VALUE_FRONT_FACE,
-   SYSTEM_VALUE_SAMPLE_ID,
-   SYSTEM_VALUE_SAMPLE_POS,
-   SYSTEM_VALUE_SAMPLE_MASK_IN,
-   SYSTEM_VALUE_HELPER_INVOCATION,
-   SYSTEM_VALUE_COLOR0,
-   SYSTEM_VALUE_COLOR1,
-   /*@}*/
-
-   /**
-    * \name Tessellation Evaluation shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_TESS_COORD,
-   SYSTEM_VALUE_VERTICES_IN,    /**< Tessellation vertices in input patch */
-   SYSTEM_VALUE_PRIMITIVE_ID,
-   SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
-   SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
-   SYSTEM_VALUE_TESS_LEVEL_OUTER_DEFAULT, /**< TCS input for passthru TCS */
-   SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT, /**< TCS input for passthru TCS */
-   /*@}*/
-
-   /**
-    * \name Compute shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_LOCAL_INVOCATION_ID,
-   SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
-   SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
-   SYSTEM_VALUE_GLOBAL_INVOCATION_INDEX,
-   SYSTEM_VALUE_WORK_GROUP_ID,
-   SYSTEM_VALUE_NUM_WORK_GROUPS,
-   SYSTEM_VALUE_LOCAL_GROUP_SIZE,
-   SYSTEM_VALUE_GLOBAL_GROUP_SIZE,
-   SYSTEM_VALUE_WORK_DIM,
-   SYSTEM_VALUE_USER_DATA_AMD,
-   /*@}*/
-
-   /** Required for VK_KHR_device_group */
-   SYSTEM_VALUE_DEVICE_INDEX,
-
-   /** Required for VK_KHX_multiview */
-   SYSTEM_VALUE_VIEW_INDEX,
-
-   /**
-    * Driver internal vertex-count, used (for example) for drivers to
-    * calculate stride for stream-out outputs.  Not externally visible.
-    */
-   SYSTEM_VALUE_VERTEX_CNT,
-
-   /**
-    * Required for AMD_shader_explicit_vertex_parameter and also used for
-    * varying-fetch instructions.
-    *
-    * The _SIZE value is "primitive size", used to scale i/j in primitive
-    * space to pixel space.
-    */
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE,
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID,
-   SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE,
-   SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL,
-   SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID,
-   SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE,
-   SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL,
-
-   /**
-    * IR3 specific geometry shader and tesselation control shader system
-    * values that packs invocation id, thread id and vertex id.  Having this
-    * as a nir level system value lets us do the unpacking in nir.
-    */
-   SYSTEM_VALUE_GS_HEADER_IR3,
-   SYSTEM_VALUE_TCS_HEADER_IR3,
-
-   SYSTEM_VALUE_MAX             /**< Number of values */
-} gl_system_value;
-
-const char *gl_system_value_name(gl_system_value sysval);
-
-/**
- * The possible interpolation qualifiers that can be applied to a fragment
- * shader input in GLSL.
- *
- * Note: INTERP_MODE_NONE must be 0 so that memsetting the
- * ir_variable data structure to 0 causes the default behavior.
- */
-enum glsl_interp_mode
-{
-   INTERP_MODE_NONE = 0,
-   INTERP_MODE_SMOOTH,
-   INTERP_MODE_FLAT,
-   INTERP_MODE_NOPERSPECTIVE,
-   INTERP_MODE_EXPLICIT,
-   INTERP_MODE_COUNT /**< Number of interpolation qualifiers */
-};
-
-enum glsl_interface_packing {
-   GLSL_INTERFACE_PACKING_STD140,
-   GLSL_INTERFACE_PACKING_SHARED,
-   GLSL_INTERFACE_PACKING_PACKED,
-   GLSL_INTERFACE_PACKING_STD430
-};
-
-const char *glsl_interp_mode_name(enum glsl_interp_mode qual);
-
-/**
- * Fragment program results
- */
-typedef enum
-{
-   FRAG_RESULT_DEPTH = 0,
-   FRAG_RESULT_STENCIL = 1,
-   /* If a single color should be written to all render targets, this
-    * register is written.  No FRAG_RESULT_DATAn will be written.
-    */
-   FRAG_RESULT_COLOR = 2,
-   FRAG_RESULT_SAMPLE_MASK = 3,
-
-   /* FRAG_RESULT_DATAn are the per-render-target (GLSL gl_FragData[n]
-    * or ARB_fragment_program fragment.color[n]) color results.  If
-    * any are written, FRAG_RESULT_COLOR will not be written.
-    * FRAG_RESULT_DATA1 and up are simply for the benefit of
-    * gl_frag_result_name() and not to be construed as an upper bound
-    */
-   FRAG_RESULT_DATA0 = 4,
-   FRAG_RESULT_DATA1,
-   FRAG_RESULT_DATA2,
-   FRAG_RESULT_DATA3,
-   FRAG_RESULT_DATA4,
-   FRAG_RESULT_DATA5,
-   FRAG_RESULT_DATA6,
-   FRAG_RESULT_DATA7,
-} gl_frag_result;
-
-const char *gl_frag_result_name(gl_frag_result result);
-
-#define FRAG_RESULT_MAX		(FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
-
-/**
- * \brief Layout qualifiers for gl_FragDepth.
- *
- * Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with
- * a layout qualifier.
- *
- * \see enum ir_depth_layout
- */
-enum gl_frag_depth_layout
-{
-   FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */
-   FRAG_DEPTH_LAYOUT_ANY,
-   FRAG_DEPTH_LAYOUT_GREATER,
-   FRAG_DEPTH_LAYOUT_LESS,
-   FRAG_DEPTH_LAYOUT_UNCHANGED
-};
-
-/**
- * \brief Buffer access qualifiers
- */
-enum gl_access_qualifier
-{
-   ACCESS_COHERENT      = (1 << 0),
-   ACCESS_RESTRICT      = (1 << 1),
-   ACCESS_VOLATILE      = (1 << 2),
-   ACCESS_NON_READABLE  = (1 << 3),
-   ACCESS_NON_WRITEABLE = (1 << 4),
-
-   /** The access may use a non-uniform buffer or image index */
-   ACCESS_NON_UNIFORM   = (1 << 5),
-
-   /* This has the same semantics as NIR_INTRINSIC_CAN_REORDER, only to be
-    * used with loads. In other words, it means that the load can be
-    * arbitrarily reordered, or combined with other loads to the same address.
-    * It is implied by ACCESS_NON_WRITEABLE together with ACCESS_RESTRICT, and
-    * a lack of ACCESS_COHERENT and ACCESS_VOLATILE.
-    */
-   ACCESS_CAN_REORDER = (1 << 6),
-
-   /** Use as little cache space as possible. */
-   ACCESS_STREAM_CACHE_POLICY = (1 << 7),
-};
-
-/**
- * \brief Blend support qualifiers
- */
-enum gl_advanced_blend_mode
-{
-   BLEND_NONE           = 0x0000,
-
-   BLEND_MULTIPLY       = 0x0001,
-   BLEND_SCREEN         = 0x0002,
-   BLEND_OVERLAY        = 0x0004,
-   BLEND_DARKEN         = 0x0008,
-   BLEND_LIGHTEN        = 0x0010,
-   BLEND_COLORDODGE     = 0x0020,
-   BLEND_COLORBURN      = 0x0040,
-   BLEND_HARDLIGHT      = 0x0080,
-   BLEND_SOFTLIGHT      = 0x0100,
-   BLEND_DIFFERENCE     = 0x0200,
-   BLEND_EXCLUSION      = 0x0400,
-   BLEND_HSL_HUE        = 0x0800,
-   BLEND_HSL_SATURATION = 0x1000,
-   BLEND_HSL_COLOR      = 0x2000,
-   BLEND_HSL_LUMINOSITY = 0x4000,
-
-   BLEND_ALL            = 0x7fff,
-};
-
-enum blend_func
-{
-   BLEND_FUNC_ADD,
-   BLEND_FUNC_SUBTRACT,
-   BLEND_FUNC_REVERSE_SUBTRACT,
-   BLEND_FUNC_MIN,
-   BLEND_FUNC_MAX,
-};
-
-enum blend_factor
-{
-   BLEND_FACTOR_ZERO,
-   BLEND_FACTOR_SRC_COLOR,
-   BLEND_FACTOR_DST_COLOR,
-   BLEND_FACTOR_SRC_ALPHA,
-   BLEND_FACTOR_DST_ALPHA,
-   BLEND_FACTOR_CONSTANT_COLOR,
-   BLEND_FACTOR_CONSTANT_ALPHA,
-   BLEND_FACTOR_SRC_ALPHA_SATURATE,
-};
-
-enum gl_tess_spacing
-{
-   TESS_SPACING_UNSPECIFIED,
-   TESS_SPACING_EQUAL,
-   TESS_SPACING_FRACTIONAL_ODD,
-   TESS_SPACING_FRACTIONAL_EVEN,
-};
-
-/**
- * A compare function enum for use in compiler lowering passes.  This is in
- * the same order as GL's compare functions (shifted down by GL_NEVER), and is
- * exactly the same as gallium's PIPE_FUNC_*.
- */
-enum compare_func
-{
-   COMPARE_FUNC_NEVER,
-   COMPARE_FUNC_LESS,
-   COMPARE_FUNC_EQUAL,
-   COMPARE_FUNC_LEQUAL,
-   COMPARE_FUNC_GREATER,
-   COMPARE_FUNC_NOTEQUAL,
-   COMPARE_FUNC_GEQUAL,
-   COMPARE_FUNC_ALWAYS,
-};
-
-/**
- * Arrangements for grouping invocations from NV_compute_shader_derivatives.
- *
- *   The extension provides new layout qualifiers that support two different
- *   arrangements of compute shader invocations for the purpose of derivative
- *   computation.  When specifying
- *
- *     layout(derivative_group_quadsNV) in;
- *
- *   compute shader invocations are grouped into 2x2x1 arrays whose four local
- *   invocation ID values follow the pattern:
- *
- *       +-----------------+------------------+
- *       | (2x+0, 2y+0, z) |  (2x+1, 2y+0, z) |
- *       +-----------------+------------------+
- *       | (2x+0, 2y+1, z) |  (2x+1, 2y+1, z) |
- *       +-----------------+------------------+
- *
- *   where Y increases from bottom to top.  When specifying
- *
- *     layout(derivative_group_linearNV) in;
- *
- *   compute shader invocations are grouped into 2x2x1 arrays whose four local
- *   invocation index values follow the pattern:
- *
- *       +------+------+
- *       | 4n+0 | 4n+1 |
- *       +------+------+
- *       | 4n+2 | 4n+3 |
- *       +------+------+
- *
- *   If neither layout qualifier is specified, derivatives in compute shaders
- *   return zero, which is consistent with the handling of built-in texture
- *   functions like texture() in GLSL 4.50 compute shaders.
- */
-enum gl_derivative_group {
-   DERIVATIVE_GROUP_NONE = 0,
-   DERIVATIVE_GROUP_QUADS,
-   DERIVATIVE_GROUP_LINEAR,
-};
-
-enum float_controls
-{
-   FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE        = 0x0000,
-   FLOAT_CONTROLS_DENORM_PRESERVE_FP16              = 0x0001,
-   FLOAT_CONTROLS_DENORM_PRESERVE_FP32              = 0x0002,
-   FLOAT_CONTROLS_DENORM_PRESERVE_FP64              = 0x0004,
-   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16         = 0x0008,
-   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32         = 0x0010,
-   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64         = 0x0020,
-   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 = 0x0040,
-   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 = 0x0080,
-   FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64 = 0x0100,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16            = 0x0200,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32            = 0x0400,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64            = 0x0800,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16            = 0x1000,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32            = 0x2000,
-   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64            = 0x4000,
-};
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* SHADER_ENUMS_H */
diff --git a/extra/disassemblers/adreno/util/bitscan.h b/extra/disassemblers/adreno/util/bitscan.h
deleted file mode 100644
index ae93e74697..0000000000
--- a/extra/disassemblers/adreno/util/bitscan.h
+++ /dev/null
@@ -1,326 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifndef BITSCAN_H
-#define BITSCAN_H
-
-#include <assert.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <string.h>
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-
-#if defined(__POPCNT__)
-#include <popcntintrin.h>
-#endif
-
-//#include "c99_compat.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/**
- * Find first bit set in word.  Least significant bit is 1.
- * Return 0 if no bits set.
- */
-#ifdef HAVE___BUILTIN_FFS
-#define ffs __builtin_ffs
-#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
-static inline
-int ffs(int i)
-{
-   unsigned long index;
-   if (_BitScanForward(&index, i))
-      return index + 1;
-   else
-      return 0;
-}
-#else
-extern
-int ffs(int i);
-#endif
-
-#ifdef HAVE___BUILTIN_FFSLL
-#define ffsll __builtin_ffsll
-#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
-static inline int
-ffsll(long long int i)
-{
-   unsigned long index;
-   if (_BitScanForward64(&index, i))
-      return index + 1;
-   else
-      return 0;
-}
-#else
-extern int
-ffsll(long long int val);
-#endif
-
-
-/* Destructively loop over all of the bits in a mask as in:
- *
- * while (mymask) {
- *   int i = u_bit_scan(&mymask);
- *   ... process element i
- * }
- *
- */
-static inline int
-u_bit_scan(unsigned *mask)
-{
-   const int i = ffs(*mask) - 1;
-   *mask ^= (1u << i);
-   return i;
-}
-
-static inline int
-u_bit_scan64(uint64_t *mask)
-{
-   const int i = ffsll(*mask) - 1;
-   *mask ^= (((uint64_t)1) << i);
-   return i;
-}
-
-/* Determine if an unsigned value is a power of two.
- *
- * \note
- * Zero is treated as a power of two.
- */
-static inline bool
-util_is_power_of_two_or_zero(unsigned v)
-{
-   return (v & (v - 1)) == 0;
-}
-
-/* Determine if an uint64_t value is a power of two.
- *
- * \note
- * Zero is treated as a power of two.
- */
-static inline bool
-util_is_power_of_two_or_zero64(uint64_t v)
-{
-   return (v & (v - 1)) == 0;
-}
-
-/* Determine if an unsigned value is a power of two.
- *
- * \note
- * Zero is \b not treated as a power of two.
- */
-static inline bool
-util_is_power_of_two_nonzero(unsigned v)
-{
-   /* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT.  The latter
-    * indicates the existence of the __builtin_popcount function.  The former
-    * indicates that _mm_popcnt_u32 exists and is a native instruction.
-    *
-    * The other alternative is to use SSE 4.2 compile-time flags.  This has
-    * two drawbacks.  First, there is currently no build infrastructure for
-    * SSE 4.2 (only 4.1), so that would have to be added.  Second, some AMD
-    * CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
-    */
-#ifdef __POPCNT__
-   return _mm_popcnt_u32(v) == 1;
-#else
-   return v != 0 && (v & (v - 1)) == 0;
-#endif
-}
-
-/* For looping over a bitmask when you want to loop over consecutive bits
- * manually, for example:
- *
- * while (mask) {
- *    int start, count, i;
- *
- *    u_bit_scan_consecutive_range(&mask, &start, &count);
- *
- *    for (i = 0; i < count; i++)
- *       ... process element (start+i)
- * }
- */
-static inline void
-u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
-{
-   if (*mask == 0xffffffff) {
-      *start = 0;
-      *count = 32;
-      *mask = 0;
-      return;
-   }
-   *start = ffs(*mask) - 1;
-   *count = ffs(~(*mask >> *start)) - 1;
-   *mask &= ~(((1u << *count) - 1) << *start);
-}
-
-static inline void
-u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
-{
-   if (*mask == ~0ull) {
-      *start = 0;
-      *count = 64;
-      *mask = 0;
-      return;
-   }
-   *start = ffsll(*mask) - 1;
-   *count = ffsll(~(*mask >> *start)) - 1;
-   *mask &= ~(((((uint64_t)1) << *count) - 1) << *start);
-}
-
-
-/**
- * Find last bit set in a word.  The least significant bit is 1.
- * Return 0 if no bits are set.
- * Essentially ffs() in the reverse direction.
- */
-static inline unsigned
-util_last_bit(unsigned u)
-{
-#if defined(HAVE___BUILTIN_CLZ)
-   return u == 0 ? 0 : 32 - __builtin_clz(u);
-#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
-   unsigned long index;
-   if (_BitScanReverse(&index, u))
-      return index + 1;
-   else
-      return 0;
-#else
-   unsigned r = 0;
-   while (u) {
-      r++;
-      u >>= 1;
-   }
-   return r;
-#endif
-}
-
-/**
- * Find last bit set in a word.  The least significant bit is 1.
- * Return 0 if no bits are set.
- * Essentially ffsll() in the reverse direction.
- */
-static inline unsigned
-util_last_bit64(uint64_t u)
-{
-#if defined(HAVE___BUILTIN_CLZLL)
-   return u == 0 ? 0 : 64 - __builtin_clzll(u);
-#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
-   unsigned long index;
-   if (_BitScanReverse64(&index, u))
-      return index + 1;
-   else
-      return 0;
-#else
-   unsigned r = 0;
-   while (u) {
-      r++;
-      u >>= 1;
-   }
-   return r;
-#endif
-}
-
-/**
- * Find last bit in a word that does not match the sign bit. The least
- * significant bit is 1.
- * Return 0 if no bits are set.
- */
-static inline unsigned
-util_last_bit_signed(int i)
-{
-   if (i >= 0)
-      return util_last_bit(i);
-   else
-      return util_last_bit(~(unsigned)i);
-}
-
-/* Returns a bitfield in which the first count bits starting at start are
- * set.
- */
-static inline unsigned
-u_bit_consecutive(unsigned start, unsigned count)
-{
-   assert(start + count <= 32);
-   if (count == 32)
-      return ~0;
-   return ((1u << count) - 1) << start;
-}
-
-static inline uint64_t
-u_bit_consecutive64(unsigned start, unsigned count)
-{
-   assert(start + count <= 64);
-   if (count == 64)
-      return ~(uint64_t)0;
-   return (((uint64_t)1 << count) - 1) << start;
-}
-
-/**
- * Return number of bits set in n.
- */
-static inline unsigned
-util_bitcount(unsigned n)
-{
-#if defined(HAVE___BUILTIN_POPCOUNT)
-   return __builtin_popcount(n);
-#else
-   /* K&R classic bitcount.
-    *
-    * For each iteration, clear the LSB from the bitfield.
-    * Requires only one iteration per set bit, instead of
-    * one iteration per bit less than highest set bit.
-    */
-   unsigned bits;
-   for (bits = 0; n; bits++) {
-      n &= n - 1;
-   }
-   return bits;
-#endif
-}
-
-static inline unsigned
-util_bitcount64(uint64_t n)
-{
-#ifdef HAVE___BUILTIN_POPCOUNTLL
-   return __builtin_popcountll(n);
-#else
-   return util_bitcount(n) + util_bitcount(n >> 32);
-#endif
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* BITSCAN_H */
diff --git a/extra/disassemblers/adreno/util/bitset.h b/extra/disassemblers/adreno/util/bitset.h
deleted file mode 100644
index 264144c39b..0000000000
--- a/extra/disassemblers/adreno/util/bitset.h
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2006  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file bitset.h
- * \brief Bitset of arbitrary size definitions.
- * \author Michal Krol
- */
-
-#ifndef BITSET_H
-#define BITSET_H
-
-//#include "util/bitscan.h"
-//#include "util/macros.h"
-
-/****************************************************************************
- * generic bitset implementation
- */
-
-#define BITSET_WORD unsigned int
-#define BITSET_WORDBITS (sizeof (BITSET_WORD) * 8)
-
-/* bitset declarations
- */
-#define BITSET_WORDS(bits) (((bits) + BITSET_WORDBITS - 1) / BITSET_WORDBITS)
-#define BITSET_DECLARE(name, bits) BITSET_WORD name[BITSET_WORDS(bits)]
-
-/* bitset operations
- */
-#define BITSET_COPY(x, y) memcpy( (x), (y), sizeof (x) )
-#define BITSET_EQUAL(x, y) (memcmp( (x), (y), sizeof (x) ) == 0)
-#define BITSET_ZERO(x) memset( (x), 0, sizeof (x) )
-#define BITSET_ONES(x) memset( (x), 0xff, sizeof (x) )
-
-#define BITSET_BITWORD(b) ((b) / BITSET_WORDBITS)
-#define BITSET_BIT(b) (1u << ((b) % BITSET_WORDBITS))
-
-/* single bit operations
- */
-#define BITSET_TEST(x, b) (((x)[BITSET_BITWORD(b)] & BITSET_BIT(b)) != 0)
-#define BITSET_SET(x, b) ((x)[BITSET_BITWORD(b)] |= BITSET_BIT(b))
-#define BITSET_CLEAR(x, b) ((x)[BITSET_BITWORD(b)] &= ~BITSET_BIT(b))
-
-#define BITSET_MASK(b) (((b) % BITSET_WORDBITS == 0) ? ~0 : BITSET_BIT(b) - 1)
-#define BITSET_RANGE(b, e) ((BITSET_MASK((e) + 1)) & ~(BITSET_BIT(b) - 1))
-
-/* bit range operations
- */
-#define BITSET_TEST_RANGE(x, b, e) \
-   (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   (((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) != 0) : \
-   (assert (!"BITSET_TEST_RANGE: bit range crosses word boundary"), 0))
-#define BITSET_SET_RANGE(x, b, e) \
-   (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   ((x)[BITSET_BITWORD(b)] |= BITSET_RANGE(b, e)) : \
-   (assert (!"BITSET_SET_RANGE: bit range crosses word boundary"), 0))
-#define BITSET_CLEAR_RANGE(x, b, e) \
-   (BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
-   ((x)[BITSET_BITWORD(b)] &= ~BITSET_RANGE(b, e)) : \
-   (assert (!"BITSET_CLEAR_RANGE: bit range crosses word boundary"), 0))
-
-/* Get first bit set in a bitset.
- */
-static inline int
-__bitset_ffs(const BITSET_WORD *x, int n)
-{
-   int i;
-
-   for (i = 0; i < n; i++) {
-      if (x[i])
-	 return ffs(x[i]) + BITSET_WORDBITS * i;
-   }
-
-   return 0;
-}
-
-#define BITSET_FFS(x) __bitset_ffs(x, ARRAY_SIZE(x))
-
-static inline unsigned
-__bitset_next_set(unsigned i, BITSET_WORD *tmp,
-                  const BITSET_WORD *set, unsigned size)
-{
-   unsigned bit, word;
-
-   /* NOTE: The initial conditions for this function are very specific.  At
-    * the start of the loop, the tmp variable must be set to *set and the
-    * initial i value set to 0.  This way, if there is a bit set in the first
-    * word, we ignore the i-value and just grab that bit (so 0 is ok, even
-    * though 0 may be returned).  If the first word is 0, then the value of
-    * `word` will be 0 and we will go on to look at the second word.
-    */
-   word = BITSET_BITWORD(i);
-   while (*tmp == 0) {
-      word++;
-
-      if (word >= BITSET_WORDS(size))
-         return size;
-
-      *tmp = set[word];
-   }
-
-   /* Find the next set bit in the non-zero word */
-   bit = ffs(*tmp) - 1;
-
-   /* Unset the bit */
-   *tmp &= ~(1ull << bit);
-
-   return word * BITSET_WORDBITS + bit;
-}
-
-/**
- * Iterates over each set bit in a set
- *
- * @param __i    iteration variable, bit number
- * @param __set  the bitset to iterate (will not be modified)
- * @param __size number of bits in the set to consider
- */
-#define BITSET_FOREACH_SET(__i, __set, __size) \
-   for (BITSET_WORD __tmp = *(__set), *__foo = &__tmp; __foo != NULL; __foo = NULL) \
-      for (__i = 0; \
-           (__i = __bitset_next_set(__i, &__tmp, __set, __size)) < __size;)
-
-#ifdef __cplusplus
-
-/**
- * Simple C++ wrapper of a bitset type of static size, with value semantics
- * and basic bitwise arithmetic operators.  The operators defined below are
- * expected to have the same semantics as the same operator applied to other
- * fundamental integer types.  T is the name of the struct to instantiate
- * it as, and N is the number of bits in the bitset.
- */
-#define DECLARE_BITSET_T(T, N) struct T {                       \
-      EXPLICIT_CONVERSION                                       \
-      operator bool() const                                     \
-      {                                                         \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            if (words[i])                                       \
-               return true;                                     \
-         return false;                                          \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      operator=(int x)                                          \
-      {                                                         \
-         const T c = {{ (BITSET_WORD)x }};                      \
-         return *this = c;                                      \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator==(const T &b, const T &c)                        \
-      {                                                         \
-         return BITSET_EQUAL(b.words, c.words);                 \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator!=(const T &b, const T &c)                        \
-      {                                                         \
-         return !(b == c);                                      \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator==(const T &b, int x)                             \
-      {                                                         \
-         const T c = {{ (BITSET_WORD)x }};                      \
-         return b == c;                                         \
-      }                                                         \
-                                                                \
-      friend bool                                               \
-      operator!=(const T &b, int x)                             \
-      {                                                         \
-         return !(b == x);                                      \
-      }                                                         \
-                                                                \
-      friend T                                                  \
-      operator~(const T &b)                                     \
-      {                                                         \
-         T c;                                                   \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            c.words[i] = ~b.words[i];                           \
-         return c;                                              \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      operator|=(const T &b)                                    \
-      {                                                         \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            words[i] |= b.words[i];                             \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      friend T                                                  \
-      operator|(const T &b, const T &c)                         \
-      {                                                         \
-         T d = b;                                               \
-         d |= c;                                                \
-         return d;                                              \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      operator&=(const T &b)                                    \
-      {                                                         \
-         for (unsigned i = 0; i < BITSET_WORDS(N); i++)         \
-            words[i] &= b.words[i];                             \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      friend T                                                  \
-      operator&(const T &b, const T &c)                         \
-      {                                                         \
-         T d = b;                                               \
-         d &= c;                                                \
-         return d;                                              \
-      }                                                         \
-                                                                \
-      bool                                                      \
-      test(unsigned i) const                                    \
-      {                                                         \
-         return BITSET_TEST(words, i);                          \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      set(unsigned i)                                           \
-      {                                                         \
-         BITSET_SET(words, i);                                  \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      T &                                                       \
-      clear(unsigned i)                                         \
-      {                                                         \
-         BITSET_CLEAR(words, i);                                \
-         return *this;                                          \
-      }                                                         \
-                                                                \
-      BITSET_WORD words[BITSET_WORDS(N)];                       \
-   }
-
-#endif
-
-#endif
diff --git a/extra/disassemblers/adreno/util/list.h b/extra/disassemblers/adreno/util/list.h
deleted file mode 100644
index 7f36e8c39d..0000000000
--- a/extra/disassemblers/adreno/util/list.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2006 VMware, Inc., Bismarck, ND. USA.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- **************************************************************************/
-
-/**
- * \file
- * List macros heavily inspired by the Linux kernel
- * list handling. No list looping yet.
- *
- * Is not threadsafe, so common operations need to
- * be protected using an external mutex.
- */
-
-#ifndef _UTIL_LIST_H_
-#define _UTIL_LIST_H_
-
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <assert.h>
-
-#ifdef DEBUG
-#  define list_assert(cond, msg)  assert(cond && msg)
-#else
-#  define list_assert(cond, msg)  (void)(0 && (cond))
-#endif
-
-struct list_head
-{
-    struct list_head *prev;
-    struct list_head *next;
-};
-
-static inline void list_inithead(struct list_head *item)
-{
-    item->prev = item;
-    item->next = item;
-}
-
-static inline void list_add(struct list_head *item, struct list_head *list)
-{
-    item->prev = list;
-    item->next = list->next;
-    list->next->prev = item;
-    list->next = item;
-}
-
-static inline void list_addtail(struct list_head *item, struct list_head *list)
-{
-    item->next = list;
-    item->prev = list->prev;
-    list->prev->next = item;
-    list->prev = item;
-}
-
-static inline bool list_is_empty(const struct list_head *list);
-
-static inline void list_replace(struct list_head *from, struct list_head *to)
-{
-    if (list_is_empty(from)) {
-        list_inithead(to);
-    } else {
-        to->prev = from->prev;
-        to->next = from->next;
-        from->next->prev = to;
-        from->prev->next = to;
-    }
-}
-
-static inline void list_del(struct list_head *item)
-{
-    item->prev->next = item->next;
-    item->next->prev = item->prev;
-    item->prev = item->next = NULL;
-}
-
-static inline void list_delinit(struct list_head *item)
-{
-    item->prev->next = item->next;
-    item->next->prev = item->prev;
-    item->next = item;
-    item->prev = item;
-}
-
-static inline bool list_is_empty(const struct list_head *list)
-{
-   return list->next == list;
-}
-
-/**
- * Returns whether the list has exactly one element.
- */
-static inline bool list_is_singular(const struct list_head *list)
-{
-   return list->next != NULL && list->next != list && list->next->next == list;
-}
-
-static inline unsigned list_length(const struct list_head *list)
-{
-   struct list_head *node;
-   unsigned length = 0;
-   for (node = list->next; node != list; node = node->next)
-      length++;
-   return length;
-}
-
-static inline void list_splice(struct list_head *src, struct list_head *dst)
-{
-   if (list_is_empty(src))
-      return;
-
-   src->next->prev = dst;
-   src->prev->next = dst->next;
-   dst->next->prev = src->prev;
-   dst->next = src->next;
-}
-
-static inline void list_splicetail(struct list_head *src, struct list_head *dst)
-{
-   if (list_is_empty(src))
-      return;
-
-   src->prev->next = dst;
-   src->next->prev = dst->prev;
-   dst->prev->next = src->next;
-   dst->prev = src->prev;
-}
-
-static inline void list_validate(const struct list_head *list)
-{
-   struct list_head *node;
-   assert(list->next->prev == list && list->prev->next == list);
-   for (node = list->next; node != list; node = node->next)
-      assert(node->next->prev == node && node->prev->next == node);
-}
-
-#define LIST_ENTRY(__type, __item, __field)   \
-    ((__type *)(((char *)(__item)) - offsetof(__type, __field)))
-
-/**
- * Cast from a pointer to a member of a struct back to the containing struct.
- *
- * 'sample' MUST be initialized, or else the result is undefined!
- */
-#ifndef container_of
-#define container_of(ptr, sample, member)				\
-    (void *)((char *)(ptr)						\
-	     - ((char *)&(sample)->member - (char *)(sample)))
-#endif
-
-#define list_first_entry(ptr, type, member) \
-        LIST_ENTRY(type, (ptr)->next, member)
-
-#define list_last_entry(ptr, type, member) \
-        LIST_ENTRY(type, (ptr)->prev, member)
-
-
-#define LIST_FOR_EACH_ENTRY(pos, head, member)				\
-   for (pos = NULL, pos = container_of((head)->next, pos, member);	\
-	&pos->member != (head);						\
-	pos = container_of(pos->member.next, pos, member))
-
-#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member)	\
-   for (pos = NULL, pos = container_of((head)->next, pos, member),	\
-	storage = container_of(pos->member.next, pos, member);	\
-	&pos->member != (head);						\
-	pos = storage, storage = container_of(storage->member.next, storage, member))
-
-#define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member)	\
-   for (pos = NULL, pos = container_of((head)->prev, pos, member),	\
-	storage = container_of(pos->member.prev, pos, member);		\
-	&pos->member != (head);						\
-	pos = storage, storage = container_of(storage->member.prev, storage, member))
-
-#define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member)		\
-   for (pos = NULL, pos = container_of((start), pos, member);		\
-	&pos->member != (head);						\
-	pos = container_of(pos->member.next, pos, member))
-
-#define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member)		\
-   for (pos = NULL, pos = container_of((start), pos, member);		\
-	&pos->member != (head);						\
-	pos = container_of(pos->member.prev, pos, member))
-
-#define list_for_each_entry(type, pos, head, member)                    \
-   for (type *pos = LIST_ENTRY(type, (head)->next, member),             \
-	     *__next = LIST_ENTRY(type, pos->member.next, member);      \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.next, member),               \
-	list_assert(pos == __next, "use _safe iterator"),               \
-	__next = LIST_ENTRY(type, __next->member.next, member))
-
-#define list_for_each_entry_safe(type, pos, head, member)               \
-   for (type *pos = LIST_ENTRY(type, (head)->next, member),             \
-	     *__next = LIST_ENTRY(type, pos->member.next, member);      \
-	&pos->member != (head);                                         \
-	pos = __next,                                                   \
-	__next = LIST_ENTRY(type, __next->member.next, member))
-
-#define list_for_each_entry_rev(type, pos, head, member)                \
-   for (type *pos = LIST_ENTRY(type, (head)->prev, member),             \
-	     *__prev = LIST_ENTRY(type, pos->member.prev, member);      \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.prev, member),               \
-	list_assert(pos == __prev, "use _safe iterator"),               \
-	__prev = LIST_ENTRY(type, __prev->member.prev, member))
-
-#define list_for_each_entry_safe_rev(type, pos, head, member)           \
-   for (type *pos = LIST_ENTRY(type, (head)->prev, member),             \
-	     *__prev = LIST_ENTRY(type, pos->member.prev, member);      \
-	&pos->member != (head);                                         \
-	pos = __prev,                                                   \
-        __prev = LIST_ENTRY(type, __prev->member.prev, member))
-
-#define list_for_each_entry_from(type, pos, start, head, member)        \
-   for (type *pos = LIST_ENTRY(type, (start), member);                  \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.next, member))
-
-#define list_for_each_entry_from_safe(type, pos, start, head, member)   \
-   for (type *pos = LIST_ENTRY(type, (start), member),                  \
-	     *__next = LIST_ENTRY(type, pos->member.next, member);      \
-	&pos->member != (head);                                         \
-	pos = __next,                                                   \
-	__next = LIST_ENTRY(type, __next->member.next, member))
-
-#define list_for_each_entry_from_rev(type, pos, start, head, member)    \
-   for (type *pos = LIST_ENTRY(type, (start), member);                  \
-	&pos->member != (head);                                         \
-	pos = LIST_ENTRY(type, pos->member.prev, member))
-
-#define list_pair_for_each_entry(type, pos1, pos2, head1, head2, member) \
-   for (type *pos1 = LIST_ENTRY(type, (head1)->next, member),           \
-             *pos2 = LIST_ENTRY(type, (head2)->next, member);           \
-        &pos1->member != (head1) && &pos2->member != (head2);           \
-	pos1 = LIST_ENTRY(type, pos1->member.next, member),               \
-	pos2 = LIST_ENTRY(type, pos2->member.next, member))
-
-#endif /*_UTIL_LIST_H_*/
diff --git a/extra/disassemblers/adreno/util/macros.h b/extra/disassemblers/adreno/util/macros.h
deleted file mode 100644
index a36bdd411e..0000000000
--- a/extra/disassemblers/adreno/util/macros.h
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef UTIL_MACROS_H
-#define UTIL_MACROS_H
-
-#include <assert.h>
-
-/* Compute the size of an array */
-#ifndef ARRAY_SIZE
-#  define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
-/* For compatibility with Clang's __has_builtin() */
-#ifndef __has_builtin
-#  define __has_builtin(x) 0
-#endif
-
-/**
- * __builtin_expect macros
- */
-#if !defined(HAVE___BUILTIN_EXPECT)
-#  define __builtin_expect(x, y) (x)
-#endif
-
-#ifndef likely
-#  ifdef HAVE___BUILTIN_EXPECT
-#    define likely(x)   __builtin_expect(!!(x), 1)
-#    define unlikely(x) __builtin_expect(!!(x), 0)
-#  else
-#    define likely(x)   (x)
-#    define unlikely(x) (x)
-#  endif
-#endif
-
-
-/**
- * Static (compile-time) assertion.
- * Basically, use COND to dimension an array.  If COND is false/zero the
- * array size will be -1 and we'll get a compilation error.
- */
-#define STATIC_ASSERT(COND) \
-   do { \
-      (void) sizeof(char [1 - 2*!(COND)]); \
-   } while (0)
-
-
-/**
- * Unreachable macro. Useful for suppressing "control reaches end of non-void
- * function" warnings.
- */
-#if defined(HAVE___BUILTIN_UNREACHABLE) || __has_builtin(__builtin_unreachable)
-#define unreachable(str)    \
-do {                        \
-   assert(!str);            \
-   __builtin_unreachable(); \
-} while (0)
-#elif defined (_MSC_VER)
-#define unreachable(str)    \
-do {                        \
-   assert(!str);            \
-   __assume(0);             \
-} while (0)
-#else
-#define unreachable(str) assert(!str)
-#endif
-
-/**
- * Assume macro. Useful for expressing our assumptions to the compiler,
- * typically for purposes of silencing warnings.
- */
-#if __has_builtin(__builtin_assume)
-#define assume(expr)       \
-do {                       \
-   assert(expr);           \
-   __builtin_assume(expr); \
-} while (0)
-#elif defined HAVE___BUILTIN_UNREACHABLE
-#define assume(expr) ((expr) ? ((void) 0) \
-                             : (assert(!"assumption failed"), \
-                                __builtin_unreachable()))
-#elif defined (_MSC_VER)
-#define assume(expr) __assume(expr)
-#else
-#define assume(expr) assert(expr)
-#endif
-
-/* Attribute const is used for functions that have no effects other than their
- * return value, and only rely on the argument values to compute the return
- * value.  As a result, calls to it can be CSEed.  Note that using memory
- * pointed to by the arguments is not allowed for const functions.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_CONST
-#define ATTRIBUTE_CONST __attribute__((__const__))
-#else
-#define ATTRIBUTE_CONST
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_FLATTEN
-#define FLATTEN __attribute__((__flatten__))
-#else
-#define FLATTEN
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_FORMAT
-#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
-#else
-#define PRINTFLIKE(f, a)
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_MALLOC
-#define MALLOCLIKE __attribute__((__malloc__))
-#else
-#define MALLOCLIKE
-#endif
-
-/* Forced function inlining */
-/* Note: Clang also sets __GNUC__ (see other cases below) */
-#ifndef ALWAYS_INLINE
-#  if defined(__GNUC__)
-#    define ALWAYS_INLINE inline __attribute__((always_inline))
-#  elif defined(_MSC_VER)
-#    define ALWAYS_INLINE __forceinline
-#  else
-#    define ALWAYS_INLINE inline
-#  endif
-#endif
-
-/* Used to optionally mark structures with misaligned elements or size as
- * packed, to trade off performance for space.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_PACKED
-#define PACKED __attribute__((__packed__))
-#else
-#define PACKED
-#endif
-
-/* Attribute pure is used for functions that have no effects other than their
- * return value.  As a result, calls to it can be dead code eliminated.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_PURE
-#define ATTRIBUTE_PURE __attribute__((__pure__))
-#else
-#define ATTRIBUTE_PURE
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL
-#define ATTRIBUTE_RETURNS_NONNULL __attribute__((__returns_nonnull__))
-#else
-#define ATTRIBUTE_RETURNS_NONNULL
-#endif
-
-#ifndef NORETURN
-#  ifdef _MSC_VER
-#    define NORETURN __declspec(noreturn)
-#  elif defined HAVE_FUNC_ATTRIBUTE_NORETURN
-#    define NORETURN __attribute__((__noreturn__))
-#  else
-#    define NORETURN
-#  endif
-#endif
-
-#ifdef __cplusplus
-/**
- * Macro function that evaluates to true if T is a trivially
- * destructible type -- that is, if its (non-virtual) destructor
- * performs no action and all member variables and base classes are
- * trivially destructible themselves.
- */
-#   if (defined(__clang__) && defined(__has_feature))
-#      if __has_feature(has_trivial_destructor)
-#         define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
-#      endif
-#   elif defined(__GNUC__)
-#      if ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)))
-#         define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
-#      endif
-#   elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#      define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
-#   endif
-#   ifndef HAS_TRIVIAL_DESTRUCTOR
-       /* It's always safe (if inefficient) to assume that a
-        * destructor is non-trivial.
-        */
-#      define HAS_TRIVIAL_DESTRUCTOR(T) (false)
-#   endif
-#endif
-
-/**
- * PUBLIC/USED macros
- *
- * If we build the library with gcc's -fvisibility=hidden flag, we'll
- * use the PUBLIC macro to mark functions that are to be exported.
- *
- * We also need to define a USED attribute, so the optimizer doesn't
- * inline a static function that we later use in an alias. - ajax
- */
-#ifndef PUBLIC
-#  if defined(__GNUC__)
-#    define PUBLIC __attribute__((visibility("default")))
-#    define USED __attribute__((used))
-#  elif defined(_MSC_VER)
-#    define PUBLIC __declspec(dllexport)
-#    define USED
-#  else
-#    define PUBLIC
-#    define USED
-#  endif
-#endif
-
-/**
- * UNUSED marks variables (or sometimes functions) that have to be defined,
- * but are sometimes (or always) unused beyond that. A common case is for
- * a function parameter to be used in some build configurations but not others.
- * Another case is fallback vfuncs that don't do anything with their params.
- *
- * Note that this should not be used for identifiers used in `assert()`;
- * see ASSERTED below.
- */
-#ifdef HAVE_FUNC_ATTRIBUTE_UNUSED
-#define UNUSED __attribute__((unused))
-#else
-#define UNUSED
-#endif
-
-/**
- * Use ASSERTED to indicate that an identifier is unused outside of an `assert()`,
- * so that assert-free builds don't get "unused variable" warnings.
- */
-#ifdef NDEBUG
-#define ASSERTED UNUSED
-#else
-#define ASSERTED
-#endif
-
-#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT
-#define MUST_CHECK __attribute__((warn_unused_result))
-#else
-#define MUST_CHECK
-#endif
-
-#if defined(__GNUC__)
-#define ATTRIBUTE_NOINLINE __attribute__((noinline))
-#else
-#define ATTRIBUTE_NOINLINE
-#endif
-
-
-/**
- * Check that STRUCT::FIELD can hold MAXVAL.  We use a lot of bitfields
- * in Mesa/gallium.  We have to be sure they're of sufficient size to
- * hold the largest expected value.
- * Note that with MSVC, enums are signed and enum bitfields need one extra
- * high bit (always zero) to ensure the max value is handled correctly.
- * This macro will detect that with MSVC, but not GCC.
- */
-#define ASSERT_BITFIELD_SIZE(STRUCT, FIELD, MAXVAL) \
-   do { \
-      ASSERTED STRUCT s; \
-      s.FIELD = (MAXVAL); \
-      assert((int) s.FIELD == (MAXVAL) && "Insufficient bitfield size!"); \
-   } while (0)
-
-
-/** Compute ceiling of integer quotient of A divided by B. */
-#define DIV_ROUND_UP( A, B )  ( ((A) + (B) - 1) / (B) )
-
-/** Clamp X to [MIN,MAX].  Turn NaN into MIN, arbitrarily. */
-#define CLAMP( X, MIN, MAX )  ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
-
-/** Minimum of two values: */
-#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
-
-/** Maximum of two values: */
-#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
-
-/** Minimum and maximum of three values: */
-#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C))
-#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C))
-
-/** Align a value to a power of two */
-#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
-
-/**
- * Macro for declaring an explicit conversion operator.  Defaults to an
- * implicit conversion if C++11 is not supported.
- */
-#if __cplusplus >= 201103L
-#define EXPLICIT_CONVERSION explicit
-#elif defined(__cplusplus)
-#define EXPLICIT_CONVERSION
-#endif
-
-/** Set a single bit */
-#define BITFIELD_BIT(b)      (1u << (b))
-/** Set all bits up to excluding bit b */
-#define BITFIELD_MASK(b)      \
-   ((b) == 32 ? (~0u) : BITFIELD_BIT((b) % 32) - 1)
-/** Set count bits starting from bit b  */
-#define BITFIELD_RANGE(b, count) \
-   (BITFIELD_MASK((b) + (count)) & ~BITFIELD_MASK(b))
-
-/** Set a single bit */
-#define BITFIELD64_BIT(b)      (1ull << (b))
-/** Set all bits up to excluding bit b */
-#define BITFIELD64_MASK(b)      \
-   ((b) == 64 ? (~0ull) : BITFIELD64_BIT(b) - 1)
-/** Set count bits starting from bit b  */
-#define BITFIELD64_RANGE(b, count) \
-   (BITFIELD64_MASK((b) + (count)) & ~BITFIELD64_MASK(b))
-
-/* TODO: In future we should try to move this to u_debug.h once header
- * dependencies are reorganised to allow this.
- */
-enum pipe_debug_type
-{
-   PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1,
-   PIPE_DEBUG_TYPE_ERROR,
-   PIPE_DEBUG_TYPE_SHADER_INFO,
-   PIPE_DEBUG_TYPE_PERF_INFO,
-   PIPE_DEBUG_TYPE_INFO,
-   PIPE_DEBUG_TYPE_FALLBACK,
-   PIPE_DEBUG_TYPE_CONFORMANCE,
-};
-
-#endif /* UTIL_MACROS_H */
diff --git a/test/external/external_benchmark_load_stable_diffusion.py b/test/external/external_benchmark_load_stable_diffusion.py
deleted file mode 100644
index 6ee0d0a325..0000000000
--- a/test/external/external_benchmark_load_stable_diffusion.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from tinygrad.helpers import fetch, Timing
-from tinygrad.device import Device
-from tinygrad.nn.state import torch_load, load_state_dict
-from examples.stable_diffusion import StableDiffusion
-
-# run "sudo purge" before testing on OS X to avoid the memory cache
-
-if __name__ == "__main__":
-  fn = fetch('https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt', 'sd-v1-4.ckpt')
-  model = StableDiffusion()
-  with Timing():
-    load_state_dict(model, torch_load(fn)['state_dict'], strict=False)
-    Device[Device.DEFAULT].synchronize()
diff --git a/test/external/external_test_embedding.py b/test/external/external_test_embedding.py
deleted file mode 100644
index 9d6bd7f2b0..0000000000
--- a/test/external/external_test_embedding.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from tinygrad.tensor import Tensor
-from tinygrad.nn import Embedding
-
-if __name__ == "__main__":
-  vocab_size = 50257
-  dim = 128
-  test = Embedding(vocab_size, dim)
-  ret = test(Tensor([[1,2,3]])).numpy()
diff --git a/test/external/external_test_hsa_driver.py b/test/external/external_test_hsa_driver.py
deleted file mode 100644
index 737dbc29b0..0000000000
--- a/test/external/external_test_hsa_driver.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import ctypes, unittest
-from tinygrad.helpers import init_c_struct_t
-from tinygrad.device import Device, Buffer
-from tinygrad.dtype import dtypes
-from tinygrad.runtime.support.hsa import AQLQueue
-from tinygrad.runtime.graph.hsa import VirtAQLQueue, HSAGraph
-from tinygrad.engine.schedule import ExecItem
-from tinygrad.engine.realize import BufferXfer
-from tinygrad.uop.ops import UOp, Ops
-
-def get_hsa_inc_prog(dev, inc=1):
-  prg = f"""
-extern "C" __attribute__((global)) void test_inc(int* data0) {{
-  data0[0] = (data0[0]+{inc});
-}}
-"""
-  return dev.runtime("test_inc", dev.compiler.compile(prg))
-
-def get_hsa_buffer_and_kernargs(dev):
-  test_buf = Buffer(Device.DEFAULT, 1, dtypes.int)
-  test_buf.copyin(memoryview(bytearray(4))) # zero mem
-  assert test_buf.as_buffer().cast('I')[0] == 0 # check mem is visible + sync to exec
-
-  args_struct_t = init_c_struct_t(tuple([('f0', ctypes.c_void_p)]))
-  kernargs = dev.alloc_kernargs(8)
-  args_st = args_struct_t.from_address(kernargs)
-  args_st.__setattr__('f0', test_buf._buf)
-  dev.flush_hdp()
-  return test_buf, kernargs
-
-@unittest.skipUnless(Device.DEFAULT == "HSA", "only run on HSA")
-class TestHSADriver(unittest.TestCase):
-  def test_hsa_simple_enqueue(self):
-    dev = Device[Device.DEFAULT]
-    queue = AQLQueue(dev, sz=256)
-
-    clprg = get_hsa_inc_prog(dev, inc=1)
-    test_buf, kernargs = get_hsa_buffer_and_kernargs(dev)
-
-    queue.submit_kernel(clprg, [1,1,1], [1,1,1], kernargs)
-    queue.wait()
-
-    assert test_buf.as_buffer().cast('I')[0] == 1, f"{test_buf.as_buffer().cast('I')[0]} != 1, all packets executed?"
-    del queue
-
-  def test_hsa_ring_enqueue(self):
-    dev = Device[Device.DEFAULT]
-
-    queue_size = 256
-    exec_cnt = int(queue_size * 1.5)
-    queue = AQLQueue(dev, sz=queue_size)
-
-    clprg_inc1 = get_hsa_inc_prog(dev, inc=1)
-    clprg_inc2 = get_hsa_inc_prog(dev, inc=2)
-    test_buf, kernargs = get_hsa_buffer_and_kernargs(dev)
-
-    for _ in range(exec_cnt):
-      queue.submit_kernel(clprg_inc1, [1,1,1], [1,1,1], kernargs)
-    for _ in range(exec_cnt):
-      queue.submit_kernel(clprg_inc2, [1,1,1], [1,1,1], kernargs)
-    queue.wait()
-
-    expected = exec_cnt + exec_cnt * 2
-    assert test_buf.as_buffer().cast('I')[0] == expected, f"{test_buf.as_buffer().cast('I')[0]} != {expected}, all packets executed?"
-    del queue
-
-  def test_hsa_blit_enqueue(self):
-    dev = Device[Device.DEFAULT]
-
-    queue_size = 256
-    exec_cnt = 178
-    queue = AQLQueue(dev, sz=queue_size)
-
-    test_buf, kernargs = get_hsa_buffer_and_kernargs(dev)
-
-    # Using VirtAQLQueue to blit them
-    virt_queue_packets_cnt = 31
-    virt_queue = VirtAQLQueue(dev, sz=virt_queue_packets_cnt)
-
-    clprogs = []
-    sum_per_blit = 0
-    for i in range(virt_queue_packets_cnt):
-      sum_per_blit += i+1
-      clprogs.append(get_hsa_inc_prog(dev, inc=i+1))
-
-    for i in range(virt_queue_packets_cnt):
-      virt_queue.submit_kernel(clprogs[i], [1,1,1], [1,1,1], kernargs)
-
-    for _ in range(exec_cnt):
-      queue.blit_packets(virt_queue.queue_base, virt_queue.packets_count)
-    queue.wait()
-
-    expected = exec_cnt * sum_per_blit
-    assert test_buf.as_buffer().cast('I')[0] == expected, f"{test_buf.as_buffer().cast('I')[0]} != {expected}, all packets executed?"
-    del queue, clprogs
-
-  def test_hsa_copies_sync(self):
-    d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"
-
-    test_buf0 = Buffer(d0, 1, dtypes.int)
-    test_buf1 = Buffer(d0, 1, dtypes.int)
-    test_buf2 = Buffer(d1, 1, dtypes.int)
-    test_buf0.copyin(memoryview(bytearray(1*4)))
-    test_buf1.copyin(memoryview(bytearray(1*4)))
-    test_buf2.copyin(memoryview(bytearray(1*4)))
-
-    jit_cache = [ExecItem(UOp(Ops.NOOP), [test_buf0, test_buf2], prg=BufferXfer(test_buf0.nbytes, test_buf0.device, test_buf2.device)),
-                 ExecItem(UOp(Ops.NOOP), [test_buf2, test_buf1], prg=BufferXfer(test_buf2.nbytes, test_buf2.device, test_buf1.device))]
-    graph = HSAGraph(jit_cache, [], {})
-
-    for i in range(10000):
-      test_buf0.copyin(memoryview(bytearray(1*4)))
-      test_buf2.copyin(memoryview(bytearray(int.to_bytes(4, length=1*4, byteorder='little'))))
-      graph([], {})
-      assert test_buf0.as_buffer().cast('I')[0] == 4
-      assert test_buf2.as_buffer().cast('I')[0] == 0
-
-if __name__ == '__main__':
-  unittest.main()
diff --git a/test/external/external_test_yolo.py b/test/external/external_test_yolo.py
deleted file mode 100644
index f28f23aa5f..0000000000
--- a/test/external/external_test_yolo.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import unittest
-from pathlib import Path
-
-import cv2
-
-from examples.yolov3 import Darknet, infer, show_labels
-from tinygrad.helpers import fetch
-
-chicken_img = cv2.imread(str(Path(__file__).parent.parent / 'models/efficientnet/Chicken.jpg'))
-car_img = cv2.imread(str(Path(__file__).parent.parent / 'models/efficientnet/car.jpg'))
-
-class TestYOLO(unittest.TestCase):
-  @classmethod
-  def setUpClass(cls):
-    cls.model = Darknet(fetch("https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg").read_bytes())
-    print("Loading weights file (237MB). This might take a while…")
-    cls.model.load_weights("https://pjreddie.com/media/files/yolov3.weights")
-
-  @classmethod
-  def tearDownClass(cls):
-    del cls.model
-
-  def test_chicken(self):
-    labels = show_labels(infer(self.model, chicken_img), confidence=0.56)
-    self.assertEqual(labels, ["bird"])
-
-  def test_car(self):
-    labels = show_labels(infer(self.model, car_img))
-    self.assertEqual(labels, ["car"])
-
-if __name__ == '__main__':
-  unittest.main()
diff --git a/test/external/graph_batchnorm.py b/test/external/graph_batchnorm.py
deleted file mode 100644
index 59e3b7961a..0000000000
--- a/test/external/graph_batchnorm.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import unittest
-from tinygrad.nn.state import get_parameters
-from tinygrad.tensor import Tensor
-from tinygrad.nn import Conv2d, BatchNorm2d, optim
-
-def model_step(lm):
-  with Tensor.train():
-    x = Tensor.ones(8,12,128,256, requires_grad=False)
-    optimizer = optim.SGD(get_parameters(lm), lr=0.001)
-    loss = lm.forward(x).sum()
-    optimizer.zero_grad()
-    loss.backward()
-    del x,loss
-    optimizer.step()
-
-class TestBatchnorm(unittest.TestCase):
-  def test_conv(self):
-    class LilModel:
-      def __init__(self):
-        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
-      def forward(self, x):
-        return self.c(x).relu()
-    lm = LilModel()
-    model_step(lm)
-
-  def test_two_conv(self):
-    class LilModel:
-      def __init__(self):
-        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
-        self.c2 = Conv2d(32, 32, 3, padding=1, bias=False)
-      def forward(self, x):
-        return self.c2(self.c(x)).relu()
-    lm = LilModel()
-    model_step(lm)
-
-  def test_two_conv_bn(self):
-    class LilModel:
-      def __init__(self):
-        self.c = Conv2d(12, 24, 3, padding=1, bias=False)
-        self.bn = BatchNorm2d(24, track_running_stats=False)
-        self.c2 = Conv2d(24, 32, 3, padding=1, bias=False)
-        self.bn2 = BatchNorm2d(32, track_running_stats=False)
-      def forward(self, x):
-        x = self.bn(self.c(x)).relu()
-        return self.bn2(self.c2(x)).relu()
-    lm = LilModel()
-    model_step(lm)
-
-  def test_conv_bn(self):
-    class LilModel:
-      def __init__(self):
-        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
-        self.bn = BatchNorm2d(32, track_running_stats=False)
-      def forward(self, x):
-        return self.bn(self.c(x)).relu()
-    lm = LilModel()
-    model_step(lm)
-
-
-if __name__ == '__main__':
-  unittest.main()