Merge branch 'master' into retinanet_mlperf

Francis Lata
2024-11-18 04:42:57 -08:00
57 changed files with 1197 additions and 1047 deletions

extra/debug_sd_speed.py Normal file

@@ -0,0 +1,99 @@
# NOTE: this is written in a way that checkout back to old commit still works
# fast SD 297ms step on M1 Max, 4444e6d https://github.com/tinygrad/tinygrad/pull/2129
# lazy rewrite, 1765849 https://github.com/tinygrad/tinygrad/pull/2878
# SD 415ms step on M1 Max on master around 11/15/2024
import time
from typing import Optional
try: from tinygrad.jit import TinyJit
except ImportError: from tinygrad import TinyJit
from tinygrad.tensor import Tensor, Device
from tinygrad.helpers import GlobalCounters
from tinygrad.nn import Linear, LayerNorm
from tinygrad.nn.state import get_parameters

class CrossAttention:
  def __init__(self, query_dim:int, ctx_dim:int, n_heads:int, d_head:int):
    self.to_q = Linear(query_dim, n_heads*d_head, bias=False)
    self.to_k = Linear(ctx_dim, n_heads*d_head, bias=False)
    self.to_v = Linear(ctx_dim, n_heads*d_head, bias=False)
    self.num_heads = n_heads
    self.head_size = d_head
    self.to_out = [Linear(n_heads*d_head, query_dim)]
  def __call__(self, x:Tensor, ctx:Optional[Tensor]=None) -> Tensor:
    ctx = x if ctx is None else ctx
    q,k,v = self.to_q(x), self.to_k(ctx), self.to_v(ctx)
    q,k,v = [y.reshape(x.shape[0], -1, self.num_heads, self.head_size).transpose(1,2) for y in (q,k,v)]
    attention = Tensor.scaled_dot_product_attention(q, k, v).transpose(1,2)
    h_ = attention.reshape(x.shape[0], -1, self.num_heads * self.head_size)
    return h_.sequential(self.to_out)

class GEGLU:
  def __init__(self, dim_in:int, dim_out:int):
    self.proj = Linear(dim_in, dim_out * 2)
    self.dim_out = dim_out
  def __call__(self, x:Tensor) -> Tensor:
    x, gate = self.proj(x).chunk(2, dim=-1)
    return x * gate.gelu()

class FeedForward:
  def __init__(self, dim:int, mult:int=4):
    self.net = [
      GEGLU(dim, dim*mult),
      lambda x: x,  # needed for weights loading code to work
      Linear(dim*mult, dim)
    ]
  def __call__(self, x:Tensor) -> Tensor:
    return x.sequential(self.net)

class BasicTransformerBlock:
  def __init__(self, dim:int, ctx_dim:int, n_heads:int, d_head:int):
    self.attn1 = CrossAttention(dim, dim, n_heads, d_head)
    self.ff = FeedForward(dim)
    self.attn2 = CrossAttention(dim, ctx_dim, n_heads, d_head)
    self.norm1 = LayerNorm(dim)
    self.norm2 = LayerNorm(dim)
    self.norm3 = LayerNorm(dim)
  def __call__(self, x:Tensor, ctx:Optional[Tensor]=None) -> Tensor:
    x = x + self.attn1(self.norm1(x))           # 5.4 before, # 6.8 master
    x = x + self.attn2(self.norm2(x), ctx=ctx)  # 12 before, 12 master
    x = x + self.ff(self.norm3(x))              # 23 before, # 27 master
    return x

def helper_test(gen, model):
  tms = []
  for _ in range(5):
    early_gen = [x.realize() if isinstance(x, Tensor) else x for x in gen()]
    GlobalCounters.reset()
    Device[Device.DEFAULT].synchronize()
    st = time.perf_counter_ns()
    model(*early_gen)
    Device[Device.DEFAULT].synchronize()
    tms.append(time.perf_counter_ns() - st)
  print(f"{min(tms)/1e6=:.2f} ms")

def derandomize_model(model):
  for p in get_parameters(model):
    p.lazydata = Tensor.empty(*p.shape, device=p.device, dtype=p.dtype).lazydata
    p.realize()

def test_transformer_block():
  # dim, d_head, x = 320, 40, (4096, 320)   # 137ms 4444e6d 115ms master
  # dim, d_head, x = 640, 80, (1024, 640)   # 36ms 4444e6d, 31ms master
  dim, d_head, x = 1280, 160, (256, 1280)   # 23ms 4444e6d, 28ms master, 31ms on 176584993
  model = [BasicTransformerBlock(dim, 768, 8, d_head) for _ in range(4)]
  derandomize_model(model)
  @TinyJit
  def test(t, t2):
    for l in model: t = l(t, t2)
    return t.realize()
  helper_test(lambda: (Tensor.empty(2, *x), Tensor.empty(2, 77, 768)), test)

if __name__ == "__main__":
  test_transformer_block()
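For reference (not part of this commit): the per-line timings in BasicTransformerBlock.__call__ point at the attention calls as the main cost, and the timing pattern above (realized Tensor.empty inputs, GlobalCounters.reset, device synchronize around a TinyJit call, min over several runs) can isolate it. A minimal standalone sketch, assuming the current top-level tinygrad exports:

import time
from tinygrad import Tensor, Device, TinyJit
from tinygrad.helpers import GlobalCounters

@TinyJit
def attn(q:Tensor, k:Tensor, v:Tensor) -> Tensor:
  return Tensor.scaled_dot_product_attention(q, k, v).realize()

tms = []
for _ in range(5):
  # shapes match the dim=1280, d_head=160, 8-head config in test_transformer_block
  q, k, v = [Tensor.empty(2, 8, 256, 160).realize() for _ in range(3)]
  GlobalCounters.reset()
  Device[Device.DEFAULT].synchronize()
  st = time.perf_counter_ns()
  attn(q, k, v)
  Device[Device.DEFAULT].synchronize()
  tms.append(time.perf_counter_ns() - st)
print(f"{min(tms)/1e6:.2f} ms")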


@@ -107,7 +107,7 @@ class ResNet:
    is_feature_only = self.fc is None
    if is_feature_only: features = []
    out = self.bn1(self.conv1(x)).relu()
-    out = out.pad2d([1,1,1,1]).max_pool2d((3,3), 2)
+    out = out.pad([1,1,1,1]).max_pool2d((3,3), 2)
    out = out.sequential(self.layer1)
    if is_feature_only: features.append(out)
    out = out.sequential(self.layer2)
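This hunk and the ones below are part of the pad2d-to-pad rename; the flat padding argument is unchanged. A minimal sketch (not from this diff) of what the call does on the ResNet stem shapes, assuming the unified Tensor.pad accepts the torch-style flat form as used here:

from tinygrad import Tensor

t = Tensor.ones(1, 64, 112, 112)           # conv1/bn1 output shape for a 224x224 input
padded = t.pad([1, 1, 1, 1])               # flat form pads the last two axes: (left, right, top, bottom)
print(padded.shape)                        # (1, 64, 114, 114)
print(padded.max_pool2d((3, 3), 2).shape)  # (1, 64, 56, 56)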


@@ -170,7 +170,7 @@ class ResNetFPN:
  def __call__(self, x):
    out = self.body.bn1(self.body.conv1(x)).relu()
-    out = out.pad2d([1,1,1,1]).max_pool2d((3,3), 2)
+    out = out.pad([1,1,1,1]).max_pool2d((3,3), 2)
    out = out.sequential(self.body.layer1)
    p3 = out.sequential(self.body.layer2)
    p4 = p3.sequential(self.body.layer3)


@@ -282,7 +282,7 @@ def MaxUnpool(xT: Tensor, xI: Tensor, outshape: Optional[Tensor]=None, kernel_sh
  if outshape is not None and (outshape := to_python_const(outshape)) != ret.shape:
    diff = [outshape[2] - ret.shape[2], outshape[3] - ret.shape[3]]
    pad_args = [diff[0]//2, diff[1]//2, diff[0]-diff[0]//2, diff[1]-diff[1]//2]
-    ret = ret.pad2d((pad_args[1], pad_args[3], pad_args[0], pad_args[2]))
+    ret = ret.pad((pad_args[1], pad_args[3], pad_args[0], pad_args[2]))
  return ret

def Conv(X: Tensor, W: Tensor, B:Optional[Tensor]=None, auto_pad="NOTSET", dilations=1, group=1, kernel_shape=None, pads=None, strides=1):
@@ -334,7 +334,7 @@ def Dropout(data: Tensor, ratio=0.5, training_mode=False, seed=None):
def LRN(x: Tensor, size, alpha=1e-4, beta=0.75, bias=1.0):
  bs, c, iy, ix = x.shape
-  return x / x.mul(x).reshape(bs,1,c,iy*ix).pad2d((0,0,(size-1)//2, size//2)).avg_pool2d((size, 1), 1).reshape(bs,c,iy,ix).mul(alpha).add(bias).pow(beta)
+  return x / x.mul(x).reshape(bs,1,c,iy*ix).pad((0,0,(size-1)//2, size//2)).avg_pool2d((size, 1), 1).reshape(bs,c,iy,ix).mul(alpha).add(bias).pow(beta)

def MeanVarianceNormalization(x: Tensor, axis=(0, 2, 3)):
  mean = x.mean(axis, keepdim=True)
@@ -563,29 +563,13 @@ def ImageDecoder(encoded_stream: Tensor, pixel_format="RGB"):
  raise ValueError(f"pixel_format={pixel_format!r} is not supported.")

def AffineGrid(theta: Tensor, size: Tensor, align_corners=0):
-  _, _, *data_sz = to_python_const(size)
-  size_zeros, original_grid = Tensor.zeros(data_sz), Tensor.ones(data_sz)
-  stackable = [original_grid]
-  for dim, dim_sz in enumerate(data_sz):
-    a = Tensor.arange(-1, 1.0001, 2/(dim_sz-1)) if align_corners == 1 else Tensor.arange(-1+1/dim_sz, 1, 2/dim_sz)
-    if dim == 0: stackable = [a.reshape(dim_sz, *[1]*(len(data_sz)-1)) + size_zeros, *stackable]
-    elif dim == 1: stackable = [a.reshape(1, dim_sz, *[1]*(len(data_sz)-2)) + size_zeros, *stackable]
-    else: stackable = [a.reshape(1, dim_sz) + size_zeros, *stackable]
-  original_grid = Tensor.stack(*stackable, dim=len(data_sz))
-  if original_grid.ndim == 3:
-    N, dim_2d, dim_homo = theta.shape
-    assert dim_2d == 2 and dim_homo == 3
-    H, W, dim_homo = original_grid.shape
-    assert dim_homo == 3
-    original_grid = original_grid.reshape(H*W, dim_homo).transpose()
-    return theta.matmul(original_grid).permute(0,2,1).reshape(N, H, W, dim_2d)
-  assert original_grid.ndim == 4
-  N, dim_3d, dim_homo = theta.shape
-  assert dim_3d == 3 and dim_homo == 4
-  D, H, W, dim_homo = original_grid.shape
-  assert dim_homo == 4
-  original_grid = original_grid.reshape(D*H*W, dim_homo).transpose()
-  return theta.matmul(original_grid).permute(0,2,1).reshape(N, D, H, W, dim_3d)
+  N, _, *spatial_dims = to_python_const(size)
+  def generate_grid(steps):
+    return Tensor.linspace(-1, 1, steps, device=theta.device) if align_corners else Tensor.linspace(-1+1/steps, 1-1/steps, steps, device=theta.device)
+  grids = Tensor.meshgrid(*(generate_grid(d) for d in spatial_dims))
+  base_grid = Tensor.stack(*reversed(grids), Tensor.ones_like(grids[0], device=theta.device), dim=-1)
+  base_grid = base_grid.reshape(1, prod(spatial_dims), len(grids)+1).expand(N, -1, -1)
+  return (base_grid @ theta.transpose(1, 2)).reshape(N, *spatial_dims, -1)

# **************** com.microsoft Ops ****************
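The rewritten AffineGrid builds a homogeneous base grid from linspace/meshgrid and applies theta with one batched matmul instead of the old per-dimension stacking. A small standalone sketch of the same construction for a 2D case with an identity theta (not part of the commit; it assumes Tensor.linspace/meshgrid/stack behave as used in the hunk above):

from tinygrad import Tensor

N, H, W = 1, 2, 3
ys, xs = Tensor.linspace(-1, 1, H), Tensor.linspace(-1, 1, W)   # align_corners-style spacing
gy, gx = Tensor.meshgrid(ys, xs)                                # both (H, W)
base = Tensor.stack(gx, gy, Tensor.ones_like(gx), dim=-1)       # (H, W, 3), channels ordered (x, y, 1)
base = base.reshape(1, H*W, 3).expand(N, -1, -1)
theta = Tensor([[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]])            # identity affine, shape (N, 2, 3)
grid = (base @ theta.transpose(1, 2)).reshape(N, H, W, 2)
print(grid.numpy())  # x varies along W, y along H, both over [-1, 1]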


@@ -2,7 +2,7 @@
from typing import Tuple
from tinygrad import Variable
from tinygrad.codegen.kernel import Opt, OptOps
-from tinygrad.ops import UOp, Ops, KernelInfo, TernaryOps, BinaryOps, UnaryOps, MetaOps
+from tinygrad.ops import UOp, Ops, KernelInfo
from tinygrad.dtype import dtypes, PtrDType
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View