Files
tinygrad/developer/speed/index.html
2026-01-13 21:35:32 +00:00

1583 lines
38 KiB
HTML

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="canonical" href="https://docs.tinygrad.org/developer/speed/">
<link rel="prev" href="../layout/">
<link rel="next" href="../uop/">
<link rel="icon" href="../../favicon.svg">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.1">
<title>Speed - tinygrad docs</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="stylesheet" href="../../assets/stylesheets/palette.ab4e12ef.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<link rel="stylesheet" href="../../assets/_markdown_exec_pyodide.css">
<link rel="stylesheet" href="../../assets/_markdown_exec_ansi.css">
<link rel="stylesheet" href="../../assets/_mkdocstrings.css">
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="lime">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#speed-in-tinygrad" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="tinygrad docs" class="md-header__button md-logo" aria-label="tinygrad docs" data-md-component="logo">
<img src="../../logo_tiny_dark.svg" alt="logo">
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
tinygrad docs
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Speed
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="(prefers-color-scheme)" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="lime" aria-label="Switch to light mode" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="Switch to light mode" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m14.3 16-.7-2h-3.2l-.7 2H7.8L11 7h2l3.2 9zM20 8.69V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12zm-9.15 3.96h2.3L12 9z"/></svg>
</label>
<input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="lime" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_2" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
<input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="lime" aria-label="Switch to system preference" type="radio" name="__palette" id="__palette_2">
<label class="md-header__button md-icon" title="Switch to system preference" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
</form>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="Search">
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
<div class="md-search__suggest" data-md-component="search-suggest"></div>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/tinygrad/tinygrad/" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
GitHub
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary md-nav--integrated" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="tinygrad docs" class="md-nav__button md-logo" aria-label="tinygrad docs" data-md-component="logo">
<img src="../../logo_tiny_dark.svg" alt="logo">
</a>
tinygrad docs
</label>
<div class="md-nav__source">
<a href="https://github.com/tinygrad/tinygrad/" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
GitHub
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1" checked>
<div class="md-nav__link md-nav__container">
<a href="../.." class="md-nav__link ">
<span class="md-ellipsis">
Home
</span>
</a>
<label class="md-nav__link " for="__nav_1" id="__nav_1_label" tabindex="">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_1_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_1">
<span class="md-nav__icon md-icon"></span>
Home
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../quickstart/" class="md-nav__link">
<span class="md-ellipsis">
Quickstart
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../showcase/" class="md-nav__link">
<span class="md-ellipsis">
Showcase
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../mnist/" class="md-nav__link">
<span class="md-ellipsis">
MNIST Tutorial
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_1_5" >
<label class="md-nav__link" for="__nav_1_5" id="__nav_1_5_label" tabindex="0">
<span class="md-ellipsis">
API Reference
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_1_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_1_5">
<span class="md-nav__icon md-icon"></span>
API Reference
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_1_5_1" >
<div class="md-nav__link md-nav__container">
<a href="../../tensor/" class="md-nav__link ">
<span class="md-ellipsis">
Tensor
</span>
</a>
<label class="md-nav__link " for="__nav_1_5_1" id="__nav_1_5_1_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="3" aria-labelledby="__nav_1_5_1_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_1_5_1">
<span class="md-nav__icon md-icon"></span>
Tensor
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../tensor/properties/" class="md-nav__link">
<span class="md-ellipsis">
Properties
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../tensor/creation/" class="md-nav__link">
<span class="md-ellipsis">
Creation
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../tensor/movement/" class="md-nav__link">
<span class="md-ellipsis">
Movement
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../tensor/elementwise/" class="md-nav__link">
<span class="md-ellipsis">
Elementwise
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../tensor/ops/" class="md-nav__link">
<span class="md-ellipsis">
Complex Ops
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../dtypes/" class="md-nav__link">
<span class="md-ellipsis">
dtypes
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../nn/" class="md-nav__link">
<span class="md-ellipsis">
nn (Neural Networks)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../env_vars/" class="md-nav__link">
<span class="md-ellipsis">
Environment Variables
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../runtime/" class="md-nav__link">
<span class="md-ellipsis">
Runtime
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1_6" checked>
<label class="md-nav__link" for="__nav_1_6" id="__nav_1_6_label" tabindex="0">
<span class="md-ellipsis">
Developer
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_1_6_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_1_6">
<span class="md-nav__icon md-icon"></span>
Developer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../developer/" class="md-nav__link">
<span class="md-ellipsis">
Intro
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../layout/" class="md-nav__link">
<span class="md-ellipsis">
Layout
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Speed
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Speed
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#overview" class="md-nav__link">
<span class="md-ellipsis">
Overview
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#compile-speed-python" class="md-nav__link">
<span class="md-ellipsis">
Compile Speed (Python)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#execution-speed-driver" class="md-nav__link">
<span class="md-ellipsis">
Execution Speed (driver)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#model-speed-scheduler" class="md-nav__link">
<span class="md-ellipsis">
Model Speed (scheduler)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#kernel-speed-codegen" class="md-nav__link">
<span class="md-ellipsis">
Kernel Speed (codegen)
</span>
</a>
<nav class="md-nav" aria-label="Kernel Speed (codegen)">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#memory" class="md-nav__link">
<span class="md-ellipsis">
Memory
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#tensor-cores" class="md-nav__link">
<span class="md-ellipsis">
Tensor Cores
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#indexing" class="md-nav__link">
<span class="md-ellipsis">
Indexing
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../uop/" class="md-nav__link">
<span class="md-ellipsis">
UOp
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_1_6_5" >
<label class="md-nav__link" for="__nav_1_6_5" id="__nav_1_6_5_label" tabindex="0">
<span class="md-ellipsis">
Runtime
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="3" aria-labelledby="__nav_1_6_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_1_6_5">
<span class="md-nav__icon md-icon"></span>
Runtime
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../runtime/" class="md-nav__link">
<span class="md-ellipsis">
Runtime Overview
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../hcq/" class="md-nav__link">
<span class="md-ellipsis">
HCQ
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../am/" class="md-nav__link">
<span class="md-ellipsis">
AM Driver
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../tinybox/" class="md-nav__link">
<span class="md-ellipsis">
tinybox
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<nav class="md-path" aria-label="Navigation" >
<ol class="md-path__list">
<li class="md-path__item">
<a href="../.." class="md-path__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-path__item">
<a href="../developer/" class="md-path__link">
<span class="md-ellipsis">
Developer
</span>
</a>
</li>
</ol>
</nav>
<article class="md-content__inner md-typeset">
<a href="https://github.com/tinygrad/tinygrad/edit/master/docs/developer/speed.md" title="Edit this page" class="md-content__button md-icon" rel="edit">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M10 20H6V4h7v5h5v3.1l2-2V8l-6-6H6c-1.1 0-2 .9-2 2v16c0 1.1.9 2 2 2h4zm10.2-7c.1 0 .3.1.4.2l1.3 1.3c.2.2.2.6 0 .8l-1 1-2.1-2.1 1-1c.1-.1.2-.2.4-.2m0 3.9L14.1 23H12v-2.1l6.1-6.1z"/></svg>
</a>
<a href="https://github.com/tinygrad/tinygrad/raw/master/docs/developer/speed.md" title="View source of this page" class="md-content__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17 18c.56 0 1 .44 1 1s-.44 1-1 1-1-.44-1-1 .44-1 1-1m0-3c-2.73 0-5.06 1.66-6 4 .94 2.34 3.27 4 6 4s5.06-1.66 6-4c-.94-2.34-3.27-4-6-4m0 6.5a2.5 2.5 0 0 1-2.5-2.5 2.5 2.5 0 0 1 2.5-2.5 2.5 2.5 0 0 1 2.5 2.5 2.5 2.5 0 0 1-2.5 2.5M9.27 20H6V4h7v5h5v4.07c.7.08 1.36.25 2 .49V8l-6-6H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h4.5a8.2 8.2 0 0 1-1.23-2"/></svg>
</a>
<h1 id="speed-in-tinygrad">speed in tinygrad<a class="headerlink" href="#speed-in-tinygrad" title="Permanent link">¤</a></h1>
<h2 id="overview">Overview<a class="headerlink" href="#overview" title="Permanent link">¤</a></h2>
<p>Speed refers to many different things. To break it down to four, there's:</p>
<ul>
<li>Compile Speed (Python)</li>
<li>Execution Speed (driver)</li>
<li>Model Speed (scheduler)</li>
<li>Kernel Speed (codegen)</li>
</ul>
<h2 id="compile-speed-python">Compile Speed (Python)<a class="headerlink" href="#compile-speed-python" title="Permanent link">¤</a></h2>
<p>This is how long the first run of your model takes. It's limited largely by the runtime of the Python doing UOp rewrites. Currently it's a bit slow, but on par with torch.compile. It gets even slower if you are using BEAM, since that's compiling many variants of each kernel.</p>
<p>This will be improved by writing faster graph_rewrite, doing less graph_rewrite, and better parallelization.</p>
<h2 id="execution-speed-driver">Execution Speed (driver)<a class="headerlink" href="#execution-speed-driver" title="Permanent link">¤</a></h2>
<p>After your model is compiled, you are often using the <code class="language-python highlight"><span class="n">TinyJIT</span></code>. tinygrad has the best execution speed of any framework because it usually bypasses the GPU driver and prebuilds the command queue. It's tons faster than normal CUDA, and often even faster than CUDA Graph.</p>
<p>There's very little to improve here, as this is almost never the bottleneck.</p>
<h2 id="model-speed-scheduler">Model Speed (scheduler)<a class="headerlink" href="#model-speed-scheduler" title="Permanent link">¤</a></h2>
<p>The scheduler determines how operations are grouped into kernels and which Tensors are written to memory. This is currently a big bottleneck of training speed.</p>
<p>The decisions are often not obvious. For example, when is it worth recomputing an arithmetic operation instead of storing and loading from memory? Example:</p>
<div class="language-python highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">tinygrad</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tensor</span>
<span class="n">a</span> <span class="o">=</span> <span class="n">Tensor</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">100</span><span class="p">)</span>
<span class="n">b</span> <span class="o">=</span> <span class="n">Tensor</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">100</span><span class="p">)</span>
<span class="n">c</span> <span class="o">=</span> <span class="n">Tensor</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">100</span><span class="p">)</span>
<span class="n">d</span> <span class="o">=</span> <span class="n">Tensor</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">100</span><span class="p">)</span>
<span class="n">out1</span> <span class="o">=</span> <span class="n">a</span><span class="o">+</span><span class="n">b</span><span class="o">+</span><span class="n">c</span>
<span class="n">out2</span> <span class="o">=</span> <span class="n">a</span><span class="o">+</span><span class="n">b</span><span class="o">+</span><span class="n">d</span>
<span class="n">Tensor</span><span class="o">.</span><span class="n">realize</span><span class="p">(</span><span class="n">out1</span><span class="p">,</span> <span class="n">out2</span><span class="p">)</span>
</code></pre></div>
<p>The real answer is obvious, compute both <code class="language-python highlight"><span class="n">out1</span></code> and <code class="language-python highlight"><span class="n">out2</span></code> in the same kernel. But you can't always do that. If you can't, should <code class="language-python highlight"><span class="n">a</span><span class="o">+</span><span class="n">b</span></code> first be saved to a subbuffer? Or should both the <code class="language-python highlight"><span class="n">out1</span></code> and <code class="language-python highlight"><span class="n">out2</span></code> kernels recompute <code class="language-python highlight"><span class="n">a</span><span class="o">+</span><span class="n">b</span></code>?</p>
<p>In this case: with recompute (6 reads + 2 writes), no recompute (6 reads + 3 writes), so we should probably recompute. However, once you add movement ops and casts this is even harder to figure out. tinygrad doesn't yet have a systematic way to do it.</p>
<h2 id="kernel-speed-codegen">Kernel Speed (codegen)<a class="headerlink" href="#kernel-speed-codegen" title="Permanent link">¤</a></h2>
<p>Given that you have decided how the model ops will be grouped and what will be written to memory, kernel speed determines how fast that operation is done. This is what BEAM changes, it searches over a set of equivalent kernels which all perform the same operation and finds the one which performs the task the fastest.</p>
<p>In <code class="language-python highlight"><span class="n">kernel</span><span class="o">.</span><span class="n">py</span></code> we have a set of <code class="language-python highlight"><span class="n">OptOps</span></code>, these control the parameters of the speed optimizations applied to the kernel.</p>
<h3 id="memory">Memory<a class="headerlink" href="#memory" title="Permanent link">¤</a></h3>
<p>The main bottleneck in most kernels is accessing memory. In a freshman algorithms class, you'll learn about cache aware matrix multiplication, and this is all forms of that. While the same math is run, the order in which you run it can have large impacts on the speed depending on if the data you are loading. OptOps will change this order.</p>
<p>Memory, even cache, is often much slower than accessing the register file. The amount of times data is used in math is called the "arithmetic intensity". For operations like BS=1 GEMV, the arithmetic intensity is 1, but for GEMMs and convs it can be much higher. OptOps like UPCAST and UNROLL can increase this, but be careful of making them too large, as if there's too much register pressure on the GPU the warp scheduler may not be able to fit many warps, or even worse, it could be spilling to local memory.</p>
<p>4090s have 1 TB/s of ram bandwidth and ~160 TFLOPS of compute, so you need to use each loaded value ~100 times. The L1 cache has around 40 TB/s of bandwidth, so in order to get full compute utilization you need to use each value ~4 times.</p>
<p>A lot of work can still be done here. For example, we never copy the inputs to on chip SRAM, but this is often quite helpful for kernel speed. Also, we aren't doing a good job with L2 cache awareness (the locals handle L1 quite well)</p>
<h3 id="tensor-cores">Tensor Cores<a class="headerlink" href="#tensor-cores" title="Permanent link">¤</a></h3>
<p>Many accelerators have Tensor Cores / MAC arrays / systolic arrays. The main value of these is that, since they are 2-D, they create an n^2 ratio between the compute and the input data.</p>
<p>GPUs use Tensor Cores instead of MAC arrays to fit better in the GPU warp paradigm. This is because the output of Tensor Cores is O(n) wrt the input, while the output of MAC arrays like the AMX is O(n^2)</p>
<p>We have a simple framework in tinygrad for adding these ALU blocks and achieving good performance from them.</p>
<h3 id="indexing">Indexing<a class="headerlink" href="#indexing" title="Permanent link">¤</a></h3>
<p>Indexing determines the address of the memory we need to load. GPUs often have less integer math resources than floating point math, so this can sometimes be the bottleneck. We have a symbolic math engine in our rewrite rules to simplify indexing before it's emitted to the kernel. Newer NVIDIA GPUs have a "Tensor Memory Accelerator" to assist with fast indexing, however, this is not supported in tinygrad yet.</p>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
<button type="button" class="md-top md-icon" data-md-component="top" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8z"/></svg>
Back to top
</button>
</main>
<footer class="md-footer">
<nav class="md-footer__inner md-grid" aria-label="Footer" >
<a href="../layout/" class="md-footer__link md-footer__link--prev" aria-label="Previous: Layout">
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</div>
<div class="md-footer__title">
<span class="md-footer__direction">
Previous
</span>
<div class="md-ellipsis">
Layout
</div>
</div>
</a>
<a href="../uop/" class="md-footer__link md-footer__link--next" aria-label="Next: UOp">
<div class="md-footer__title">
<span class="md-footer__direction">
Next
</span>
<div class="md-ellipsis">
UOp
</div>
</div>
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11z"/></svg>
</div>
</a>
</nav>
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": ["announce.dismiss", "content.action.edit", "content.action.view", "content.code.annotate", "content.code.copy", "content.tooltips", "navigation.footer", "navigation.indexes", "navigation.sections", "navigation.expand", "navigation.top", "navigation.path", "search.highlight", "search.suggest", "toc.follow", "toc.integrate"], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="../../assets/_markdown_exec_pyodide.js"></script>
</body>
</html>