mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-04-29 03:00:14 -04:00
Merge branch 'master' into retinanet_mlperf
99  extra/debug_sd_speed.py  Normal file
@@ -0,0 +1,99 @@
# NOTE: this is written in a way that checking out an old commit still works
# fast SD 297ms step on M1 Max, 4444e6d https://github.com/tinygrad/tinygrad/pull/2129
# lazy rewrite, 1765849 https://github.com/tinygrad/tinygrad/pull/2878
# SD 415ms step on M1 Max on master around 11/15/2024

import time
from typing import Optional
try: from tinygrad.jit import TinyJit
except ImportError: from tinygrad import TinyJit
from tinygrad.tensor import Tensor, Device
from tinygrad.helpers import GlobalCounters
from tinygrad.nn import Linear, LayerNorm
from tinygrad.nn.state import get_parameters

class CrossAttention:
  def __init__(self, query_dim:int, ctx_dim:int, n_heads:int, d_head:int):
    self.to_q = Linear(query_dim, n_heads*d_head, bias=False)
    self.to_k = Linear(ctx_dim, n_heads*d_head, bias=False)
    self.to_v = Linear(ctx_dim, n_heads*d_head, bias=False)
    self.num_heads = n_heads
    self.head_size = d_head
    self.to_out = [Linear(n_heads*d_head, query_dim)]

  def __call__(self, x:Tensor, ctx:Optional[Tensor]=None) -> Tensor:
    ctx = x if ctx is None else ctx
    q,k,v = self.to_q(x), self.to_k(ctx), self.to_v(ctx)
    q,k,v = [y.reshape(x.shape[0], -1, self.num_heads, self.head_size).transpose(1,2) for y in (q,k,v)]
    attention = Tensor.scaled_dot_product_attention(q, k, v).transpose(1,2)
    h_ = attention.reshape(x.shape[0], -1, self.num_heads * self.head_size)
    return h_.sequential(self.to_out)

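# gated-GELU feedforward activation (GEGLU, from the GLU-variants paper, arXiv:2002.05202)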
class GEGLU:
  def __init__(self, dim_in:int, dim_out:int):
    self.proj = Linear(dim_in, dim_out * 2)
    self.dim_out = dim_out

  def __call__(self, x:Tensor) -> Tensor:
    x, gate = self.proj(x).chunk(2, dim=-1)
    return x * gate.gelu()

class FeedForward:
  def __init__(self, dim:int, mult:int=4):
    self.net = [
      GEGLU(dim, dim*mult),
      lambda x: x,  # needed for weights loading code to work
      Linear(dim*mult, dim)
    ]

  def __call__(self, x:Tensor) -> Tensor:
    return x.sequential(self.net)

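# pre-LN transformer block: self-attention, cross-attention over the text context, then feedforward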
class BasicTransformerBlock:
  def __init__(self, dim:int, ctx_dim:int, n_heads:int, d_head:int):
    self.attn1 = CrossAttention(dim, dim, n_heads, d_head)
    self.ff = FeedForward(dim)
    self.attn2 = CrossAttention(dim, ctx_dim, n_heads, d_head)
    self.norm1 = LayerNorm(dim)
    self.norm2 = LayerNorm(dim)
    self.norm3 = LayerNorm(dim)

  def __call__(self, x:Tensor, ctx:Optional[Tensor]=None) -> Tensor:
    x = x + self.attn1(self.norm1(x))           # 5.4 before, 6.8 master
    x = x + self.attn2(self.norm2(x), ctx=ctx)  # 12 before, 12 master
    x = x + self.ff(self.norm3(x))              # 23 before, 27 master
    return x

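# timing harness: realize inputs up front, sync the device, then time one full model call;
# the first iterations include TinyJit capture/compile, so min(tms) is the steady-state step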
def helper_test(gen, model):
  tms = []
  for _ in range(5):
    early_gen = [x.realize() if isinstance(x, Tensor) else x for x in gen()]
    GlobalCounters.reset()
    Device[Device.DEFAULT].synchronize()
    st = time.perf_counter_ns()
    model(*early_gen)
    Device[Device.DEFAULT].synchronize()
    tms.append(time.perf_counter_ns() - st)
  print(f"{min(tms)/1e6=:.2f} ms")

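# swap every parameter for an uninitialized buffer of the same shape/dtype: the values don't
# matter for timing, and this skips the cost of generating random weights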
def derandomize_model(model):
  for p in get_parameters(model):
    p.lazydata = Tensor.empty(*p.shape, device=p.device, dtype=p.dtype).lazydata
    p.realize()

def test_transformer_block():
  # dim, d_head, x = 320, 40, (4096, 320)   # 137ms 4444e6d, 115ms master
  # dim, d_head, x = 640, 80, (1024, 640)   # 36ms 4444e6d, 31ms master
  dim, d_head, x = 1280, 160, (256, 1280)   # 23ms 4444e6d, 28ms master, 31ms on 176584993

  model = [BasicTransformerBlock(dim, 768, 8, d_head) for _ in range(4)]

  derandomize_model(model)
  @TinyJit
  def test(t, t2):
    for l in model: t = l(t, t2)
    return t.realize()
  helper_test(lambda: (Tensor.empty(2, *x), Tensor.empty(2, 77, 768)), test)

if __name__ == "__main__":
  test_transformer_block()
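The script above times five calls and reports the fastest one, since the early TinyJit invocations include graph capture and kernel compilation. Below is a minimal, self-contained sketch of the same measurement pattern, assuming a recent tinygrad install; the `step` matmul is a hypothetical stand-in workload, not part of this commit:

import time
from tinygrad import Tensor, TinyJit, Device

@TinyJit
def step(t: Tensor) -> Tensor:
  return (t @ t).realize()  # hypothetical stand-in workload

tms = []
for _ in range(5):
  x = Tensor.empty(256, 256).realize()   # realize the input outside the timed region
  Device[Device.DEFAULT].synchronize()   # drain queued work before starting the clock
  st = time.perf_counter_ns()
  step(x)
  Device[Device.DEFAULT].synchronize()   # wait until the kernel actually finishes
  tms.append(time.perf_counter_ns() - st)
print(f"{min(tms)/1e6:.2f} ms")          # min skips the capture/compile iterations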
@@ -107,7 +107,7 @@ class ResNet:
     is_feature_only = self.fc is None
     if is_feature_only: features = []
     out = self.bn1(self.conv1(x)).relu()
-    out = out.pad2d([1,1,1,1]).max_pool2d((3,3), 2)
+    out = out.pad([1,1,1,1]).max_pool2d((3,3), 2)
     out = out.sequential(self.layer1)
     if is_feature_only: features.append(out)
     out = out.sequential(self.layer2)

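This hunk, along with the ResNetFPN, MaxUnpool, and LRN hunks below, is the same mechanical rename: `Tensor.pad2d` became `Tensor.pad`, keeping the flat `(left, right, top, bottom)` padding over the last two dimensions. A quick equivalence check, assuming a tinygrad version where `pad` accepts flat torch-style padding:

from tinygrad import Tensor

t = Tensor.ones(1, 3, 8, 8)
# old spelling: t.pad2d([1, 1, 1, 1])
out = t.pad([1, 1, 1, 1])  # (left, right, top, bottom) on the last two dims
print(out.shape)           # -> (1, 3, 10, 10)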
@@ -170,7 +170,7 @@ class ResNetFPN:
 
   def __call__(self, x):
     out = self.body.bn1(self.body.conv1(x)).relu()
-    out = out.pad2d([1,1,1,1]).max_pool2d((3,3), 2)
+    out = out.pad([1,1,1,1]).max_pool2d((3,3), 2)
     out = out.sequential(self.body.layer1)
     p3 = out.sequential(self.body.layer2)
     p4 = p3.sequential(self.body.layer3)

@@ -282,7 +282,7 @@ def MaxUnpool(xT: Tensor, xI: Tensor, outshape: Optional[Tensor]=None, kernel_sh
   if outshape is not None and (outshape := to_python_const(outshape)) != ret.shape:
     diff = [outshape[2] - ret.shape[2], outshape[3] - ret.shape[3]]
     pad_args = [diff[0]//2, diff[1]//2, diff[0]-diff[0]//2, diff[1]-diff[1]//2]
-    ret = ret.pad2d((pad_args[1], pad_args[3], pad_args[0], pad_args[2]))
+    ret = ret.pad((pad_args[1], pad_args[3], pad_args[0], pad_args[2]))
   return ret
 
 def Conv(X: Tensor, W: Tensor, B:Optional[Tensor]=None, auto_pad="NOTSET", dilations=1, group=1, kernel_shape=None, pads=None, strides=1):

@@ -334,7 +334,7 @@ def Dropout(data: Tensor, ratio=0.5, training_mode=False, seed=None):
 
 def LRN(x: Tensor, size, alpha=1e-4, beta=0.75, bias=1.0):
   bs, c, iy, ix = x.shape
-  return x / x.mul(x).reshape(bs,1,c,iy*ix).pad2d((0,0,(size-1)//2, size//2)).avg_pool2d((size, 1), 1).reshape(bs,c,iy,ix).mul(alpha).add(bias).pow(beta)
+  return x / x.mul(x).reshape(bs,1,c,iy*ix).pad((0,0,(size-1)//2, size//2)).avg_pool2d((size, 1), 1).reshape(bs,c,iy,ix).mul(alpha).add(bias).pow(beta)
 
 def MeanVarianceNormalization(x: Tensor, axis=(0, 2, 3)):
   mean = x.mean(axis, keepdim=True)

@@ -563,29 +563,13 @@ def ImageDecoder(encoded_stream: Tensor, pixel_format="RGB"):
   raise ValueError(f"pixel_format={pixel_format!r} is not supported.")
 
 def AffineGrid(theta: Tensor, size: Tensor, align_corners=0):
-  _, _, *data_sz = to_python_const(size)
-  size_zeros, original_grid = Tensor.zeros(data_sz), Tensor.ones(data_sz)
-  stackable = [original_grid]
-  for dim, dim_sz in enumerate(data_sz):
-    a = Tensor.arange(-1, 1.0001, 2/(dim_sz-1)) if align_corners == 1 else Tensor.arange(-1+1/dim_sz, 1, 2/dim_sz)
-    if dim == 0: stackable = [a.reshape(dim_sz, *[1]*(len(data_sz)-1)) + size_zeros, *stackable]
-    elif dim == 1: stackable = [a.reshape(1, dim_sz, *[1]*(len(data_sz)-2)) + size_zeros, *stackable]
-    else: stackable = [a.reshape(1, dim_sz) + size_zeros, *stackable]
-  original_grid = Tensor.stack(*stackable, dim=len(data_sz))
-  if original_grid.ndim == 3:
-    N, dim_2d, dim_homo = theta.shape
-    assert dim_2d == 2 and dim_homo == 3
-    H, W, dim_homo = original_grid.shape
-    assert dim_homo == 3
-    original_grid = original_grid.reshape(H*W, dim_homo).transpose()
-    return theta.matmul(original_grid).permute(0,2,1).reshape(N, H, W, dim_2d)
-  assert original_grid.ndim == 4
-  N, dim_3d, dim_homo = theta.shape
-  assert dim_3d == 3 and dim_homo == 4
-  D, H, W, dim_homo = original_grid.shape
-  assert dim_homo == 4
-  original_grid = original_grid.reshape(D*H*W, dim_homo).transpose()
-  return theta.matmul(original_grid).permute(0,2,1).reshape(N, D, H, W, dim_3d)
+  N, _, *spatial_dims = to_python_const(size)
+  def generate_grid(steps):
+    return Tensor.linspace(-1, 1, steps, device=theta.device) if align_corners else Tensor.linspace(-1+1/steps, 1-1/steps, steps, device=theta.device)
+  grids = Tensor.meshgrid(*(generate_grid(d) for d in spatial_dims))
+  base_grid = Tensor.stack(*reversed(grids), Tensor.ones_like(grids[0], device=theta.device), dim=-1)
+  base_grid = base_grid.reshape(1, prod(spatial_dims), len(grids)+1).expand(N, -1, -1)
+  return (base_grid @ theta.transpose(1, 2)).reshape(N, *spatial_dims, -1)
 
 # **************** com.microsoft Ops ****************
 
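The rewritten AffineGrid builds a single normalized base grid with `linspace`/`meshgrid` and applies `theta` as one batched matmul, replacing the per-axis `arange` stacking. The behavioral subtlety is where the samples land on each axis; a small sketch of `generate_grid` for a length-4 axis, assuming tinygrad's `Tensor.linspace`:

from tinygrad import Tensor

# align_corners=1: endpoints are included
print(Tensor.linspace(-1, 1, 4).numpy())              # [-1.  -0.3333  0.3333  1. ]
# align_corners=0: samples sit at pixel centers
print(Tensor.linspace(-1 + 1/4, 1 - 1/4, 4).numpy())  # [-0.75 -0.25  0.25  0.75]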
@@ -2,7 +2,7 @@
 from typing import Tuple
 from tinygrad import Variable
 from tinygrad.codegen.kernel import Opt, OptOps
-from tinygrad.ops import UOp, Ops, KernelInfo, TernaryOps, BinaryOps, UnaryOps, MetaOps
+from tinygrad.ops import UOp, Ops, KernelInfo
 from tinygrad.dtype import dtypes, PtrDType
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View

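The trimmed import reflects tinygrad folding the per-arity enums (UnaryOps, BinaryOps, TernaryOps, MetaOps) into the single unified `Ops` enum, so call sites change spelling only. A sketch of the mechanical update, assuming the unified enum exposes the same member names:

from tinygrad.ops import Ops

# before the merge: BinaryOps.ADD, UnaryOps.EXP2, TernaryOps.WHERE
# after: the same members live directly on Ops
add_op, exp_op, where_op = Ops.ADD, Ops.EXP2, Ops.WHERE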