# tinygrad/extra/huggingface_onnx/collect_metadata.py

import yaml, time, requests, argparse
from pathlib import Path
from huggingface_hub import list_models, HfApi
from tinygrad.helpers import tqdm

# base URL used to build both the repo page URL and the per-file `resolve/main` URL
HUGGINGFACE_URL = "https://huggingface.co"
# file-name fragments to exclude; get_metadata drops any file whose path CONTAINS one of these (substring match),
# so e.g. "int8" also matches "uint8" and "q4" also matches "q4f16"
SKIPPED_FILES = [
  "fp16", "int8", "uint8", "quantized",      # numerical accuracy issues
  "avx2", "arm64", "avx512", "avx512_vnni",  # numerical accuracy issues
  "q4", "q4f16", "bnb4",                     # unimplemented quantization
  "model_O4",                                # requires non cpu ort runner and MemcpyFromHost op
  "merged",                                  # TODO implement attribute with graph type and Loop op
]
# repo ids to exclude; get_top_repos compares these against the full model id (exact match)
SKIPPED_REPO_PATHS = [
  # Invalid model-index
  "AdamCodd/vit-base-nsfw-detector",
  # TODO: implement attribute with graph type and Loop op
  "minishlab/potion-base-8M", "minishlab/M2V_base_output", "minishlab/potion-retrieval-32M",
  # TODO: implement SimplifiedLayerNormalization, SkipSimplifiedLayerNormalization, GroupQueryAttention
  "HuggingFaceTB/SmolLM2-360M-Instruct",
  # TODO: implement SimplifiedLayerNormalization, SkipSimplifiedLayerNormalization, RotaryEmbedding, MultiHeadAttention
  "HuggingFaceTB/SmolLM2-1.7B-Instruct",
  # TODO: implement RandomNormalLike
  "stabilityai/stable-diffusion-xl-base-1.0", "stabilityai/sdxl-turbo", 'SimianLuo/LCM_Dreamshaper_v7',
  # TODO: implement NonZero
  "mangoapps/fb_zeroshot_mnli_onnx",
  # TODO huge Concat in here with 1024 (1, 3, 32, 32) Tensors, and maybe a MOD bug with const folding
  "briaai/RMBG-2.0",
]

def get_top_repos(n: int, sort: str) -> list[str]: # list["FacebookAI/xlm-roberta-large", ...]
  """Return up to `n` repo ids from the Hub listing filtered by "onnx" and sorted by `sort`.

  Repos listed in SKIPPED_REPO_PATHS are passed over and do not count towards `n`.

  Args:
    n: maximum number of repo ids to return; a value <= 0 yields an empty list.
    sort: property passed to `list_models` as the sort key; it is also read off each
      model (via getattr) for the progress line.

  Returns:
    Repo ids in the order `list_models` produced them; fewer than `n` if the listing runs out.
  """
  print(f"** Getting top {n} models sorted by {sort} **")
  repos: list[str] = []
  # nothing requested: return before starting the listing, otherwise the stop condition below is never reached
  if n <= 0: return repos
  for model in list_models(filter="onnx", sort=sort):
    if model.id in SKIPPED_REPO_PATHS: continue
    print(f"{len(repos)+1}/{n}: {model.id} ({getattr(model, sort)})")
    repos.append(model.id)
    if len(repos) >= n: break
  return repos

def get_metadata(repos:list[str]) -> dict:
  """Build the report dict describing the ONNX files of every repo in `repos`.

  Only files ending in `.onnx` / `.onnx_data` whose path contains none of the SKIPPED_FILES
  fragments are recorded.

  Args:
    repos: Hub repo ids, e.g. "FacebookAI/xlm-roberta-large".

  Returns:
    A dict with:
      "repositories": {repo: {"url": str, "download_path": None, "files": [{"file": str, "size": "<x>MB"}, ...]}}
      "total_size":   summed size of all recorded files, formatted as "<x>GB"
      "created_at":   UTC timestamp of when the report was built

  Raises:
    requests.RequestException: if a size lookup fails or exceeds the timeout.
  """
  api = HfApi()
  repos_metadata = {"repositories": {}}
  total_size = 0

  # TODO: speed head requests up with async?
  for repo in tqdm(repos, desc="Getting metadata"):
    files_metadata = []
    model_info = api.model_info(repo)

    # guard against a missing (None) file listing instead of failing on iteration
    for file in model_info.siblings or []:
      filename = file.rfilename
      if not filename.endswith(('.onnx', '.onnx_data')): continue
      if any(skip_str in filename for skip_str in SKIPPED_FILES): continue
      file_size = file.size
      if not file_size:
        # the listing carried no size: ask the server, following redirects to the actual file.
        # the timeout keeps one stalled connection from hanging the whole run.
        head = requests.head(f"{HUGGINGFACE_URL}/{repo}/resolve/main/{filename}", allow_redirects=True, timeout=30)
        file_size = int(head.headers.get('Content-Length', 0))
      files_metadata.append({"file": filename, "size": f"{file_size/1e6:.2f}MB"})
      total_size += file_size

    repos_metadata["repositories"][repo] = {
      "url": f"{HUGGINGFACE_URL}/{repo}",
      "download_path": None,
      "files": files_metadata,
    }
  repos_metadata['total_size'] = f"{total_size/1e9:.2f}GB"
  repos_metadata['created_at'] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
  return repos_metadata

if __name__ == "__main__":
  # command line: how many repos to collect and what to name the report
  cli = argparse.ArgumentParser(description="Produces a YAML file with metadata of top huggingface onnx models")
  cli.add_argument("--limit", type=int, required=True, help="Number of top repositories to process (e.g., 100)")
  cli.add_argument("--output", type=str, default="huggingface_repos.yaml", help="Output YAML file name to save the report")
  opts = cli.parse_args()

  sort_key = "downloads" # recent 30 days downloads
  report = get_metadata(get_top_repos(opts.limit, sort_key))

  # the report is written into the directory containing this script
  out_path = Path(__file__).parent / opts.output
  with open(out_path, 'w') as out_file:
    yaml.dump(report, out_file, sort_keys=False)
    print(f"YAML saved to: {out_path}")