Merge branch 'master' into retinanet_mlperf

Francis Lata
2024-11-18 04:42:57 -08:00
57 changed files with 1197 additions and 1047 deletions

extra/debug_sd_speed.py Normal file

@@ -0,0 +1,99 @@
# NOTE: this is written in a way that checkout back to old commit still works
# fast SD 297ms step on M1 Max, 4444e6d https://github.com/tinygrad/tinygrad/pull/2129
# lazy rewrite, 1765849 https://github.com/tinygrad/tinygrad/pull/2878
# SD 415ms step on M1 Max on master around 11/15/2024
import time
from typing import Optional
try: from tinygrad.jit import TinyJit
except ImportError: from tinygrad import TinyJit
from tinygrad.tensor import Tensor, Device
from tinygrad.helpers import GlobalCounters
from tinygrad.nn import Linear, LayerNorm
from tinygrad.nn.state import get_parameters

class CrossAttention:
  def __init__(self, query_dim:int, ctx_dim:int, n_heads:int, d_head:int):
    self.to_q = Linear(query_dim, n_heads*d_head, bias=False)
    self.to_k = Linear(ctx_dim, n_heads*d_head, bias=False)
    self.to_v = Linear(ctx_dim, n_heads*d_head, bias=False)
    self.num_heads = n_heads
    self.head_size = d_head
    self.to_out = [Linear(n_heads*d_head, query_dim)]
  def __call__(self, x:Tensor, ctx:Optional[Tensor]=None) -> Tensor:
    ctx = x if ctx is None else ctx
    q,k,v = self.to_q(x), self.to_k(ctx), self.to_v(ctx)
    q,k,v = [y.reshape(x.shape[0], -1, self.num_heads, self.head_size).transpose(1,2) for y in (q,k,v)]
    attention = Tensor.scaled_dot_product_attention(q, k, v).transpose(1,2)
    h_ = attention.reshape(x.shape[0], -1, self.num_heads * self.head_size)
    return h_.sequential(self.to_out)

class GEGLU:
  def __init__(self, dim_in:int, dim_out:int):
    self.proj = Linear(dim_in, dim_out * 2)
    self.dim_out = dim_out
  def __call__(self, x:Tensor) -> Tensor:
    x, gate = self.proj(x).chunk(2, dim=-1)
    return x * gate.gelu()

class FeedForward:
  def __init__(self, dim:int, mult:int=4):
    self.net = [
      GEGLU(dim, dim*mult),
      lambda x: x,  # needed for weights loading code to work
      Linear(dim*mult, dim)
    ]
  def __call__(self, x:Tensor) -> Tensor:
    return x.sequential(self.net)

class BasicTransformerBlock:
  def __init__(self, dim:int, ctx_dim:int, n_heads:int, d_head:int):
    self.attn1 = CrossAttention(dim, dim, n_heads, d_head)
    self.ff = FeedForward(dim)
    self.attn2 = CrossAttention(dim, ctx_dim, n_heads, d_head)
    self.norm1 = LayerNorm(dim)
    self.norm2 = LayerNorm(dim)
    self.norm3 = LayerNorm(dim)
  def __call__(self, x:Tensor, ctx:Optional[Tensor]=None) -> Tensor:
    x = x + self.attn1(self.norm1(x))           # 5.4 before, # 6.8 master
    x = x + self.attn2(self.norm2(x), ctx=ctx)  # 12 before, 12 master
    x = x + self.ff(self.norm3(x))              # 23 before, # 27 master
    return x

def helper_test(gen, model):
  tms = []
  for _ in range(5):
    early_gen = [x.realize() if isinstance(x, Tensor) else x for x in gen()]
    GlobalCounters.reset()
    Device[Device.DEFAULT].synchronize()
    st = time.perf_counter_ns()
    model(*early_gen)
    Device[Device.DEFAULT].synchronize()
    tms.append(time.perf_counter_ns() - st)
  print(f"{min(tms)/1e6=:.2f} ms")

def derandomize_model(model):
  for p in get_parameters(model):
    p.lazydata = Tensor.empty(*p.shape, device=p.device, dtype=p.dtype).lazydata
    p.realize()

def test_transformer_block():
  # dim, d_head, x = 320, 40, (4096, 320)   # 137ms 4444e6d 115ms master
  # dim, d_head, x = 640, 80, (1024, 640)   # 36ms 4444e6d, 31ms master
  dim, d_head, x = 1280, 160, (256, 1280)   # 23ms 4444e6d, 28ms master, 31ms on 176584993
  model = [BasicTransformerBlock(dim, 768, 8, d_head) for _ in range(4)]
  derandomize_model(model)
  @TinyJit
  def test(t, t2):
    for l in model: t = l(t, t2)
    return t.realize()
  helper_test(lambda: (Tensor.empty(2, *x), Tensor.empty(2, 77, 768)), test)

if __name__ == "__main__":
  test_transformer_block()
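For reference (not part of this commit): the per-line timings in BasicTransformerBlock.__call__ point at the attention calls as the main cost, and the timing pattern above (realized Tensor.empty inputs, GlobalCounters.reset, device synchronize around a TinyJit call, min over several runs) can isolate it. A minimal standalone sketch, assuming the current top-level tinygrad exports:

import time
from tinygrad import Tensor, Device, TinyJit
from tinygrad.helpers import GlobalCounters

@TinyJit
def attn(q:Tensor, k:Tensor, v:Tensor) -> Tensor:
  return Tensor.scaled_dot_product_attention(q, k, v).realize()

tms = []
for _ in range(5):
  # shapes match the dim=1280, d_head=160, 8-head config in test_transformer_block
  q, k, v = [Tensor.empty(2, 8, 256, 160).realize() for _ in range(3)]
  GlobalCounters.reset()
  Device[Device.DEFAULT].synchronize()
  st = time.perf_counter_ns()
  attn(q, k, v)
  Device[Device.DEFAULT].synchronize()
  tms.append(time.perf_counter_ns() - st)
print(f"{min(tms)/1e6:.2f} ms")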


@@ -107,7 +107,7 @@ class ResNet:
    is_feature_only = self.fc is None
    if is_feature_only: features = []
    out = self.bn1(self.conv1(x)).relu()
-    out = out.pad2d([1,1,1,1]).max_pool2d((3,3), 2)
+    out = out.pad([1,1,1,1]).max_pool2d((3,3), 2)
    out = out.sequential(self.layer1)
    if is_feature_only: features.append(out)
    out = out.sequential(self.layer2)
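This hunk and the ones below are part of the pad2d-to-pad rename; the flat padding argument is unchanged. A minimal sketch (not from this diff) of what the call does on the ResNet stem shapes, assuming the unified Tensor.pad accepts the torch-style flat form as used here:

from tinygrad import Tensor

t = Tensor.ones(1, 64, 112, 112)           # conv1/bn1 output shape for a 224x224 input
padded = t.pad([1, 1, 1, 1])               # flat form pads the last two axes: (left, right, top, bottom)
print(padded.shape)                        # (1, 64, 114, 114)
print(padded.max_pool2d((3, 3), 2).shape)  # (1, 64, 56, 56)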


@@ -170,7 +170,7 @@ class ResNetFPN:
  def __call__(self, x):
    out = self.body.bn1(self.body.conv1(x)).relu()
-    out = out.pad2d([1,1,1,1]).max_pool2d((3,3), 2)
+    out = out.pad([1,1,1,1]).max_pool2d((3,3), 2)
    out = out.sequential(self.body.layer1)
    p3 = out.sequential(self.body.layer2)
    p4 = p3.sequential(self.body.layer3)


@@ -282,7 +282,7 @@ def MaxUnpool(xT: Tensor, xI: Tensor, outshape: Optional[Tensor]=None, kernel_sh
  if outshape is not None and (outshape := to_python_const(outshape)) != ret.shape:
    diff = [outshape[2] - ret.shape[2], outshape[3] - ret.shape[3]]
    pad_args = [diff[0]//2, diff[1]//2, diff[0]-diff[0]//2, diff[1]-diff[1]//2]
-    ret = ret.pad2d((pad_args[1], pad_args[3], pad_args[0], pad_args[2]))
+    ret = ret.pad((pad_args[1], pad_args[3], pad_args[0], pad_args[2]))
  return ret

def Conv(X: Tensor, W: Tensor, B:Optional[Tensor]=None, auto_pad="NOTSET", dilations=1, group=1, kernel_shape=None, pads=None, strides=1):
@@ -334,7 +334,7 @@ def Dropout(data: Tensor, ratio=0.5, training_mode=False, seed=None):
def LRN(x: Tensor, size, alpha=1e-4, beta=0.75, bias=1.0):
  bs, c, iy, ix = x.shape
-  return x / x.mul(x).reshape(bs,1,c,iy*ix).pad2d((0,0,(size-1)//2, size//2)).avg_pool2d((size, 1), 1).reshape(bs,c,iy,ix).mul(alpha).add(bias).pow(beta)
+  return x / x.mul(x).reshape(bs,1,c,iy*ix).pad((0,0,(size-1)//2, size//2)).avg_pool2d((size, 1), 1).reshape(bs,c,iy,ix).mul(alpha).add(bias).pow(beta)

def MeanVarianceNormalization(x: Tensor, axis=(0, 2, 3)):
  mean = x.mean(axis, keepdim=True)
@@ -563,29 +563,13 @@ def ImageDecoder(encoded_stream: Tensor, pixel_format="RGB"):
  raise ValueError(f"pixel_format={pixel_format!r} is not supported.")

def AffineGrid(theta: Tensor, size: Tensor, align_corners=0):
-  _, _, *data_sz = to_python_const(size)
-  size_zeros, original_grid = Tensor.zeros(data_sz), Tensor.ones(data_sz)
-  stackable = [original_grid]
-  for dim, dim_sz in enumerate(data_sz):
-    a = Tensor.arange(-1, 1.0001, 2/(dim_sz-1)) if align_corners == 1 else Tensor.arange(-1+1/dim_sz, 1, 2/dim_sz)
-    if dim == 0: stackable = [a.reshape(dim_sz, *[1]*(len(data_sz)-1)) + size_zeros, *stackable]
-    elif dim == 1: stackable = [a.reshape(1, dim_sz, *[1]*(len(data_sz)-2)) + size_zeros, *stackable]
-    else: stackable = [a.reshape(1, dim_sz) + size_zeros, *stackable]
-  original_grid = Tensor.stack(*stackable, dim=len(data_sz))
-  if original_grid.ndim == 3:
-    N, dim_2d, dim_homo = theta.shape
-    assert dim_2d == 2 and dim_homo == 3
-    H, W, dim_homo = original_grid.shape
-    assert dim_homo == 3
-    original_grid = original_grid.reshape(H*W, dim_homo).transpose()
-    return theta.matmul(original_grid).permute(0,2,1).reshape(N, H, W, dim_2d)
-  assert original_grid.ndim == 4
-  N, dim_3d, dim_homo = theta.shape
-  assert dim_3d == 3 and dim_homo == 4
-  D, H, W, dim_homo = original_grid.shape
-  assert dim_homo == 4
-  original_grid = original_grid.reshape(D*H*W, dim_homo).transpose()
-  return theta.matmul(original_grid).permute(0,2,1).reshape(N, D, H, W, dim_3d)
+  N, _, *spatial_dims = to_python_const(size)
+  def generate_grid(steps):
+    return Tensor.linspace(-1, 1, steps, device=theta.device) if align_corners else Tensor.linspace(-1+1/steps, 1-1/steps, steps, device=theta.device)
+  grids = Tensor.meshgrid(*(generate_grid(d) for d in spatial_dims))
+  base_grid = Tensor.stack(*reversed(grids), Tensor.ones_like(grids[0], device=theta.device), dim=-1)
+  base_grid = base_grid.reshape(1, prod(spatial_dims), len(grids)+1).expand(N, -1, -1)
+  return (base_grid @ theta.transpose(1, 2)).reshape(N, *spatial_dims, -1)

# **************** com.microsoft Ops ****************
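The rewritten AffineGrid builds a homogeneous base grid from linspace/meshgrid and applies theta with one batched matmul instead of the old per-dimension stacking. A small standalone sketch of the same construction for a 2D case with an identity theta (not part of the commit; it assumes Tensor.linspace/meshgrid/stack behave as used in the hunk above):

from tinygrad import Tensor

N, H, W = 1, 2, 3
ys, xs = Tensor.linspace(-1, 1, H), Tensor.linspace(-1, 1, W)   # align_corners-style spacing
gy, gx = Tensor.meshgrid(ys, xs)                                # both (H, W)
base = Tensor.stack(gx, gy, Tensor.ones_like(gx), dim=-1)       # (H, W, 3), channels ordered (x, y, 1)
base = base.reshape(1, H*W, 3).expand(N, -1, -1)
theta = Tensor([[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]])            # identity affine, shape (N, 2, 3)
grid = (base @ theta.transpose(1, 2)).reshape(N, H, W, 2)
print(grid.numpy())  # x varies along W, y along H, both over [-1, 1]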


@@ -2,7 +2,7 @@
from typing import Tuple
from tinygrad import Variable
from tinygrad.codegen.kernel import Opt, OptOps
-from tinygrad.ops import UOp, Ops, KernelInfo, TernaryOps, BinaryOps, UnaryOps, MetaOps
+from tinygrad.ops import UOp, Ops, KernelInfo
from tinygrad.dtype import dtypes, PtrDType
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View