delete more tests (#12043)

* delete more tests

* delete and simplify

* flaky on windows

* a few more, those remained
This commit is contained in:
George Hotz
2025-09-05 15:31:30 -07:00
committed by GitHub
parent 12c7b1bb01
commit ee4f696086
11 changed files with 12 additions and 410 deletions

View File

@@ -20,12 +20,6 @@ repos:
language: system
always_run: true
pass_filenames: false
- id: devicetests
name: select GPU tests
entry: env GPU=1 PYTHONPATH="." python3 -m pytest test/test_uops.py test/test_search.py
language: system
always_run: true
pass_filenames: false
- id: tests
name: subset of tests
entry: env PYTHONPATH="." python3 -m pytest -n=4 test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py

View File

@@ -6,7 +6,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
from tinygrad.runtime.autogen import libc
from tinygrad.runtime.support.system import PCIIfaceBase
from tinygrad.engine.realize import get_runner, CompiledRunner, get_program
from tinygrad.codegen.opt.kernel import Opt, OptOps
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad import Variable
MOCKGPU = getenv("MOCKGPU")

View File

@@ -1,13 +1,11 @@
import unittest, contextlib
import unittest
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
from tinygrad.helpers import CI, Context, getenv
from tinygrad.engine.realize import run_schedule
from tinygrad.codegen.opt.kernel import Opt, OptOps, Kernel, KernelOptError
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
from tinygrad.codegen.opt.search import get_kernel_actions
from tinygrad.uop.ops import Ops
from tinygrad.codegen import apply_rewrites, rewrites_for_views
class TestArange(unittest.TestCase):
def _get_flops(self, N, opts=None):
@@ -49,28 +47,6 @@ class TestArange(unittest.TestCase):
@unittest.skip("doesn't work yet")
def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.PADTO, axis=1, arg=32)])
def test_all_opts(self, opts=None, exclude=None):
k = Kernel(apply_rewrites(Tensor.arange(256).schedule()[-1].ast, rewrites_for_views))
if opts is not None:
for o in opts: k.apply_opt(o)
all_opts_256 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
k = Kernel(apply_rewrites(Tensor.arange(2560).schedule()[-1].ast, rewrites_for_views))
if opts is not None:
for o in opts: k.apply_opt(o)
all_opts_2560 = [kk.applied_opts for kk in get_kernel_actions(k, include_0=False).values()]
all_opts = [x for x in all_opts_256 if x in all_opts_2560]
for opts in all_opts:
if exclude is not None and opts[-1] in exclude: continue
print(opts)
self.test_complexity(opts)
def test_all_opts_w_local(self):
with contextlib.suppress(KernelOptError):
return self.test_all_opts([Opt(OptOps.LOCAL, 0, 16)], [Opt(op=OptOps.PADTO, axis=1, arg=32)])
def test_all_opts_w_upcast(self): return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4)])
def test_all_opts_w_unroll(self): return self.test_all_opts([Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])
def test_all_opts_w_upcast_and_unroll(self):
return self.test_all_opts([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], [Opt(op=OptOps.GROUP, axis=0, arg=0)])
class TestRand(unittest.TestCase):
def test_fused_rand_less_ops(self, noopt=1):
GlobalCounters.reset()

View File

@@ -1,165 +0,0 @@
# ruff: noqa: E501
import unittest
from tinygrad import dtypes
from tinygrad.codegen.opt.kernel import Kernel
from tinygrad.codegen.opt.search import Opt, OptOps, bufs_from_lin
from extra.optimization.helpers import time_linearizer
# stuff needed to unpack a kernel
from tinygrad.uop.ops import UOp, Ops
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
def _test_overflow(ast, opts):
lin = Kernel(ast)
lin.apply_opts(opts)
bufs = bufs_from_lin(lin)
print(bufs)
time_linearizer(lin, bufs)
# NOTE: if you want these to trigger, set launch bounds on HIP kernels
@unittest.skip("unneeded without launch bounds")
class TestLinearizerOverflow(unittest.TestCase):
def test_overflow_1(self):
ast = UOp(Ops.SINK, None, arg=None, src=(
UOp(Ops.STORE, None, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(51380224), arg=0, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(802816, 0, 12544, 112, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),
UOp(Ops.MAX, dtypes.float, arg=None, src=(
UOp(Ops.ADD, dtypes.float, arg=None, src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.ADD, dtypes.float, arg=None, src=(
UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9633792), arg=1, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 64, 1, 3, 8, 230, 8, 230), strides=(0, 150528, 0, 50176, 0, 224, 0, 1), offset=-675, mask=((0, 1), (0, 64), (0, 1), (0, 3), (0, 8), (3, 227), (0, 8), (3, 227)), contiguous=False), View(shape=(64, 1, 64, 112, 112, 3, 7, 7), strides=(10156800, 0, 0, 3680, 2, 3385600, 425040, 231), offset=0, mask=None, contiguous=False))), src=()),)),
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9408), arg=2, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 3, 7, 7), strides=(0, 0, 147, 0, 0, 49, 7, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),
x16:=UOp(Ops.CONST, dtypes.float, arg=0.0, src=(
x17:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(0, 0, 0, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=3, src=()),
x20:=UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(64, 1, 64, 112, 112, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),
UOp(Ops.SQRT, dtypes.float, arg=None, src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
x23:=UOp(Ops.CONST, dtypes.float, arg=1.0, src=(
x17,)),
UOp(Ops.RECIP, dtypes.float, arg=None, src=(
UOp(Ops.ADD, dtypes.float, arg=None, src=(
x23,
UOp(Ops.CONST, dtypes.float, arg=1e-05, src=(
x17,)),)),)),)),)),)),
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(64), arg=4, src=()),
x20,)),)),
x16,)),)),))
opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.LOCAL, axis=2, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4), Opt(op=OptOps.UPCAST, axis=2, arg=0)]
_test_overflow(ast, opts)
# From BEAM on hlb_cifar.py
def test_overflow_2(self):
ast = UOp(Ops.SINK, None, arg=None, src=(
UOp(Ops.STORE, None, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(33554432), arg=0, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(512, 1, 64, 32, 32, 1, 1, 1), strides=(65536, 0, 1024, 32, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),
UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(16777216), arg=1, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 512, 1, 32, 4, 34, 4, 34), strides=(0, 32768, 0, 1024, 0, 32, 0, 1), offset=-33, mask=((0, 1), (0, 512), (0, 1), (0, 32), (0, 4), (1, 33), (0, 4), (1, 33)), contiguous=False), View(shape=(512, 1, 64, 32, 32, 32, 3, 3), strides=(591872, 0, 0, 136, 1, 18496, 4760, 35), offset=0, mask=None, contiguous=False))), src=()),)),
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(18432), arg=2, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(512, 1, 64, 32, 32, 32, 3, 3), strides=(0, 0, 288, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),))
opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.LOCAL, axis=2, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=2, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UNROLL, axis=0, arg=0)]
_test_overflow(ast, opts)
# from BEAM on default simple_conv.py (which is quite large):
def test_overflow_3(self):
ast = UOp(Ops.SINK, None, arg=None, src=(
UOp(Ops.STORE, None, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(33554432), arg=0, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(16, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),
UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(33554432), arg=1, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 16, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), offset=-129, mask=((0, 1), (0, 16), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(16, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 131), offset=0, mask=None, contiguous=False))), src=()),)),
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(147456), arg=2, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(16, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),))
opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.LOCAL, axis=2, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=2, arg=2)]
_test_overflow(ast, opts)
# from BEAM on BS=4 simple_conv.py:
def test_overflow_4(self):
ast = UOp(Ops.SINK, None, arg=None, src=(
UOp(Ops.STORE, None, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(8388608), arg=0, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(4, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),
UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(8388608), arg=1, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 4, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), offset=-129, mask=((0, 1), (0, 4), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(4, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 131), offset=0, mask=None, contiguous=False))), src=()),)),
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(147456), arg=2, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(4, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),))
opts = [Opt(op=OptOps.UPCAST, axis=3, arg=4), Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=2, arg=4), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=2, arg=4)]
_test_overflow(ast, opts)
# from BEAM on BS=2 simple_conv.py:
def test_overflow_5(self):
ast = UOp(Ops.SINK, None, arg=None, src=(
UOp(Ops.STORE, None, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4194304), arg=0, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(2, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),
UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(4194304), arg=1, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 2, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), offset=-129, mask=((0, 1), (0, 2), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(2, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 131), offset=0, mask=None, contiguous=False))), src=()),)),
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(147456), arg=2, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(2, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),))
opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.LOCAL, axis=2, arg=2), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=2, arg=2)]
_test_overflow(ast, opts)
# from BEAM on BS=3 simple_conv.py:
def test_overflow_6(self):
ast = UOp(Ops.SINK, None, arg=None, src=(
UOp(Ops.STORE, None, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6291456), arg=0, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(3, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),
UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6291456), arg=1, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 3, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), offset=-129, mask=((0, 1), (0, 3), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(3, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 131), offset=0, mask=None, contiguous=False))), src=()),)),
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(147456), arg=2, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(3, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),))
opts = [Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.UPCAST, axis=3, arg=0), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=2, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=3, arg=2)]
_test_overflow(ast, opts)
# from BEAM on BS=3 simple_conv.py: (alt)
def test_overflow_7(self):
ast = UOp(Ops.SINK, None, arg=None, src=(
UOp(Ops.STORE, None, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6291456), arg=0, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(3, 1, 128, 128, 128, 1, 1, 1), strides=(2097152, 0, 16384, 128, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=()),
UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (7, 6, 5)), src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(6291456), arg=1, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(1, 3, 1, 128, 4, 130, 4, 130), strides=(0, 2097152, 0, 16384, 0, 128, 0, 1), offset=-129, mask=((0, 1), (0, 3), (0, 1), (0, 128), (0, 4), (1, 129), (0, 4), (1, 129)), contiguous=False), View(shape=(3, 1, 128, 128, 128, 128, 3, 3), strides=(34611200, 0, 0, 520, 1, 270400, 68120, 131), offset=0, mask=None, contiguous=False))), src=()),)),
UOp(Ops.LOAD, dtypes.float, arg=None, src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(147456), arg=2, src=()),
UOp(Ops.VIEW, None, arg=ShapeTracker(views=(View(shape=(3, 1, 128, 128, 128, 128, 3, 3), strides=(0, 0, 1152, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)),))
opts = [Opt(op=OptOps.UPCAST, axis=3, arg=4), Opt(op=OptOps.LOCAL, axis=3, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=4), Opt(op=OptOps.LOCAL, axis=2, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=2), Opt(op=OptOps.UPCAST, axis=2, arg=4)]
_test_overflow(ast, opts)
if __name__ == '__main__':
unittest.main()

View File

@@ -2,7 +2,7 @@ import numpy as np
import unittest
from tinygrad import Tensor
from tinygrad.helpers import get_single_element
from tinygrad.codegen.opt.kernel import Opt, OptOps
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
class TestOptGemm(unittest.TestCase):

View File

@@ -1,7 +1,7 @@
import unittest
from tinygrad import Tensor, Device
from tinygrad.helpers import RANGEIFY
from tinygrad.codegen.opt.kernel import Opt, OptOps
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.engine.realize import get_program
@unittest.skipIf(RANGEIFY>0, "arg is partial contig in rangeify")

View File

@@ -3,11 +3,9 @@ import numpy as np
import unittest
from dataclasses import replace
from tinygrad import Tensor, Context, Device, dtypes
from tinygrad.uop.ops import Ops, UOp # noqa: F401 # pylint: disable=unused-import
from tinygrad.codegen.opt.kernel import Kernel, Opt, OptOps
from tinygrad.uop.ops import Ops
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.engine.realize import CompiledRunner, ExecItem, lower_schedule_item, get_program
from tinygrad.codegen.opt.search import bufs_from_lin
from tinygrad.shape.shapetracker import ShapeTracker, View # noqa: F401 # pylint: disable=unused-import
N = 512
@@ -236,129 +234,5 @@ class TestQuantizeOnnx(unittest.TestCase):
opts = [Opt(op=OptOps.UPCAST, axis=0, arg=128), Opt(op=OptOps.UNROLL, axis=0, arg=4)]
sexec(out, opts)
@unittest.skipIf(Device.DEFAULT != "DSP", "only tests for DSP")
class TestDSPCache(unittest.TestCase):
def test_cache_speed(self):
# kept as a string because this breaks the Python language server's syntax highlighting for some reason
ast = eval("""UOp(Ops.SINK, dtypes.void, arg=None, src=(
UOp(Ops.STORE, dtypes.void, arg=None, src=(
UOp(Ops.VIEW, dtypes.uchar.ptr(25088), arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 896, 32, 1, 0), offset=0, mask=None, contiguous=True),)), src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(25088), arg=0, src=()),)),
UOp(Ops.CAST, dtypes.uchar, arg=None, src=(
UOp(Ops.XOR, dtypes.int, arg=None, src=(
UOp(Ops.MAX, dtypes.int, arg=None, src=(
UOp(Ops.XOR, dtypes.int, arg=None, src=(
UOp(Ops.MAX, dtypes.int, arg=None, src=(
UOp(Ops.CAST, dtypes.int, arg=None, src=(
UOp(Ops.ADD, dtypes.float, arg=None, src=(
UOp(Ops.ADD, dtypes.float, arg=None, src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.ADD, dtypes.float, arg=None, src=(
UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (4,)), src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.CAST, dtypes.float, arg=None, src=(
UOp(Ops.CAST, dtypes.int, arg=None, src=(
UOp(Ops.LOAD, dtypes.uchar, arg=None, src=(
UOp(Ops.VIEW, dtypes.uchar.ptr(150528), arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 192), strides=(0, 5376, 192, 0, 1), offset=0, mask=None, contiguous=False),)), src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(150528), arg=1, src=()),)),)),)),)),
UOp(Ops.CONST, dtypes.float, arg=0.012368360534310341, src=(
x22:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 192), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.CAST, dtypes.float, arg=None, src=(
UOp(Ops.CAST, dtypes.int, arg=None, src=(
UOp(Ops.LOAD, dtypes.char, arg=None, src=(
UOp(Ops.VIEW, dtypes.char.ptr(6144), arg=ShapeTracker(views=(View(shape=(32, 48, 4), strides=(4, 128, 1), offset=0, mask=None, contiguous=False), View(shape=(1, 28, 28, 32, 192), strides=(0, 0, 0, 192, 1), offset=0, mask=None, contiguous=False))), src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.char.ptr(6144), arg=2, src=()),)),)),)),)),
UOp(Ops.CONST, dtypes.float, arg=0.007441135589033365, src=(
x22,)),)),)),)),
UOp(Ops.MUL, dtypes.float, arg=None, src=(
UOp(Ops.CAST, dtypes.float, arg=None, src=(
UOp(Ops.LOAD, dtypes.int, arg=None, src=(
UOp(Ops.VIEW, dtypes.int.ptr(32), arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 0, 0, 1, 0), offset=0, mask=None, contiguous=False),)), src=(
UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(32), arg=3, src=()),)),)),)),
UOp(Ops.CONST, dtypes.float, arg=9.203465015161783e-05, src=(
x36:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1, 28, 28, 32, 1), strides=(0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),
UOp(Ops.CONST, dtypes.float, arg=33.812857328652136, src=(
x36,)),)),
UOp(Ops.CONST, dtypes.float, arg=0.4999999, src=(
x36,)),)),
UOp(Ops.CONST, dtypes.float, arg=136.0, src=(
x36,)),)),)),
UOp(Ops.CONST, dtypes.int, arg=0, src=(
x36,)),)),
x41:=UOp(Ops.CONST, dtypes.int, arg=-1, src=(
x36,)),)),
UOp(Ops.CONST, dtypes.int, arg=-256, src=(
x36,)),)),
x41,)),)),)),))""")
opts = [Opt(op=OptOps.UNROLL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=1, arg=32), Opt(op=OptOps.UPCAST, axis=0, arg=4)]
with Context(DEVECTORIZE=0, QUANTIZE=1):
prg = get_program(ast, opts=opts)
new_src = """
typedef int int32 __attribute__((aligned(128),vector_size(128)));
typedef signed char signed_char128 __attribute__((aligned(128),vector_size(128)));
typedef unsigned char unsigned_char8 __attribute__((aligned(8),vector_size(8)));
typedef unsigned char unsigned_char4 __attribute__((aligned(4),vector_size(4)));
typedef unsigned char unsigned_char128 __attribute__((aligned(128),vector_size(128)));
__attribute__((noinline)) void r_196_32_4_24_8(unsigned char* restrict __attribute__((align_value(128))) data0, unsigned char* restrict __attribute__((align_value(128))) data1, signed char* restrict __attribute__((align_value(
128))) data2, int* restrict __attribute__((align_value(128))) data3) {
int32 cast0 = (int32){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int32 val0 = *((int32*)((data3+0)));
for (int ridx0 = 0; ridx0 < 196; ridx0++) {
int32 acc0 = cast0;
int32 acc1 = cast0;
int32 acc2 = cast0;
int32 acc3 = cast0;
__builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768);
__builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768+192);
__builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768+384);
__builtin_HEXAGON_Y2_dcfetch(data1+ridx0*768+576);
for (int ridx1 = 0; ridx1 < 24; ridx1++) {
signed_char128 val1 = *((signed_char128*)((data2+(ridx1<<8))));
signed_char128 val2 = *((signed_char128*)((data2+((1+(ridx1<<1))<<7))));
int alu0 = ((ridx0*768)+(ridx1<<3));
unsigned_char8 val3 = *((unsigned_char8*)((data1+alu0)));
__builtin_HEXAGON_Y2_dcfetch(((data1+alu0)+16));
unsigned_char8 val4 = *((unsigned_char8*)((data1+(alu0+192))));
__builtin_HEXAGON_Y2_dcfetch(((data1+(alu0+192))+16));
unsigned_char8 val5 = *((unsigned_char8*)((data1+(alu0+384))));
__builtin_HEXAGON_Y2_dcfetch(((data1+(alu0+384))+16));
unsigned_char8 val6 = *((unsigned_char8*)((data1+(alu0+576))));
__builtin_HEXAGON_Y2_dcfetch(((data1+(alu0+576))+16));
unsigned_char4 alu5 = __builtin_shufflevector(val3, val3, 0, 1, 2, 3);
unsigned_char4 alu6 = __builtin_shufflevector(val4, val4, 0, 1, 2, 3);
unsigned_char4 alu7 = __builtin_shufflevector(val5, val5, 0, 1, 2, 3);
unsigned_char4 alu8 = __builtin_shufflevector(val6, val6, 0, 1, 2, 3);
acc0 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc0, val1, (*((unsigned int*)&alu5)));
acc1 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc1, val1, (*((unsigned int*)&alu6)));
acc2 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc2, val1, (*((unsigned int*)&alu7)));
acc3 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc3, val1, (*((unsigned int*)&alu8)));
unsigned_char4 alu9 = __builtin_shufflevector(val3, val3, 4, 5, 6, 7);
unsigned_char4 alu10 = __builtin_shufflevector(val4, val4, 4, 5, 6, 7);
unsigned_char4 alu11 = __builtin_shufflevector(val5, val5, 4, 5, 6, 7);
unsigned_char4 alu12 = __builtin_shufflevector(val6, val6, 4, 5, 6, 7);
acc0 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc0, val2, (*((unsigned int*)&alu9)));
acc1 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc1, val2, (*((unsigned int*)&alu10)));
acc2 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc2, val2, (*((unsigned int*)&alu11)));
acc3 = __builtin_HEXAGON_V6_vrmpybus_acc_128B(acc3, val2, (*((unsigned int*)&alu12)));
}
unsigned_char128 alu18 = __builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B((((((acc3+val0)*203)+32767)/65536)+136), (((((acc2+val0)*203)+32767)/65536)+136)), __builtin_HEXAGON_V6_vpackwh_sat_128B((((((acc1+val0)*203)+32767)/65536)+136), (((((acc0+val0)*203)+32767)/65536)+136)));
*((unsigned_char128*)((data0+(ridx0<<7)))) = alu18;
}
}
"""
prg = replace(prg, src=new_src+prg.src.split("/* DSP boilerplate */ ")[1])
rt = CompiledRunner(prg)
#Device.default.compiler.disassemble(rt.lib)
ei = ExecItem(rt, bufs_from_lin(Kernel(ast)))
tm = ei.run(wait=True)
print(f"final time {tm*1e6:.2f} us")
if __name__ == "__main__":
unittest.main()

View File

@@ -1,79 +0,0 @@
import unittest
from tinygrad.codegen.opt.kernel import Opt, OptOps, Kernel
from tinygrad.codegen.opt.search import bufs_from_lin, actions, beam_search
from tinygrad.tensor import Tensor
from tinygrad.helpers import Context, GlobalCounters
from tinygrad.engine.realize import capturing
class TestBEAM(unittest.TestCase):
def test_dynamic_beam(self):
# TODO: make this infra globally usable
class Capture:
def __init__(self): self.captured = []
def add(self, x): self.captured.append(x)
capturing.append(Capture())
kernel_count = GlobalCounters.kernel_count
with Context(BEAM=1): Tensor.zeros(16).contiguous().realize()
assert GlobalCounters.kernel_count == kernel_count + 1
k_beam_1 = capturing[0].captured
capturing.clear()
capturing.append(Capture())
kernel_count = GlobalCounters.kernel_count
with Context(BEAM=0): Tensor.zeros(16).contiguous().realize()
assert GlobalCounters.kernel_count == kernel_count + 1
k_beam_0 = capturing[0].captured
capturing.clear()
self.assertNotEqual(k_beam_0[-1].prg.p.src, k_beam_1[-1].prg.p.src)
def test_get_kernel_actions_dedup(self):
from test.test_linearizer import helper_realized_ast
from tinygrad.codegen.opt.search import get_kernel_actions
a = Tensor.empty(4, 3)
b = Tensor.empty(3)
realized_ast, _ = helper_realized_ast(a @ b)
candidates = [
Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.UPCAST, axis=0, arg=4),
Opt(op=OptOps.LOCAL, axis=0, arg=0), Opt(op=OptOps.LOCAL, axis=0, arg=4),
Opt(op=OptOps.UNROLL, axis=0, arg=0), Opt(op=OptOps.UNROLL, axis=0, arg=3),
Opt(op=OptOps.GROUP, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=3),
Opt(op=OptOps.GROUPTOP, axis=0, arg=0), Opt(op=OptOps.GROUPTOP, axis=0, arg=3),
]
lins = get_kernel_actions(Kernel(realized_ast), include_0=False, candidates=candidates).values()
# ensure amt=0 are not duplicated
assert all(len(x.applied_opts) == 1 for x in lins)
kernel_actions = [x.applied_opts[0] for x in lins]
assert Opt(OptOps.UPCAST, axis=0, arg=4) not in kernel_actions, "did not de-dup UPCAST"
assert Opt(OptOps.LOCAL, axis=0, arg=4) not in kernel_actions, "did not de-dup LOCAL"
assert Opt(OptOps.UNROLL, axis=0, arg=3) not in kernel_actions, "did not de-dup UNROLL"
assert Opt(OptOps.GROUP, axis=0, arg=3) not in kernel_actions, "did not de-dup GROUP"
assert Opt(OptOps.GROUPTOP, axis=0, arg=3) not in kernel_actions, "did not de-dup GROUPTOP"
def test_get_kernel_actions_preserves_actions_state(self):
from test.test_linearizer import helper_realized_ast
from tinygrad.codegen.opt.search import get_kernel_actions
a = Tensor.rand(16, 16)
b = Tensor.rand(16, 16)
realized_ast, _ = helper_realized_ast(a @ b)
actions_before = actions.copy()
get_kernel_actions(Kernel(realized_ast))
actions_after = actions.copy()
assert actions_after == actions_before, "actions state was not preserved"
def test_beam_unnamed_kernels(self):
from test.test_linearizer import push_views
a = Tensor.rand(100)
b = Tensor.rand(100)
si = (a+b).schedule()[-1]
lin = Kernel(push_views(si.ast))
bufs = bufs_from_lin(lin)
# TODO: beam should have better instrumentation so we don't have to check this indirect thing
kcount = len(Kernel.kernel_cnt)
beam_search(lin, bufs, 3, disable_cache=True)
self.assertEqual(kcount, len(Kernel.kernel_cnt))
if __name__ == '__main__':
unittest.main()

View File

@@ -14,7 +14,7 @@ from tinygrad.engine.realize import CompiledRunner, get_program
from tinygrad.codegen import full_rewrite
from tinygrad.uop.symbolic import sym
from tinygrad.device import is_dtype_supported
from tinygrad.codegen.opt.kernel import Opt, OptOps
from tinygrad.codegen.opt import Opt, OptOps
def to_uops_list(u:list[UOp], opts=None, skip_check=False) -> list[UOp]: return full_rewrite(UOp.sink(*u), opts)

View File

@@ -6,7 +6,7 @@ from tinygrad.renderer import Estimates
from tinygrad.codegen import full_rewrite
from tinygrad.uop.ops import Ops, UOp
from tinygrad.dtype import dtypes
from tinygrad.codegen.opt.kernel import Opt, OptOps, KernelOptError
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
from tinygrad.device import Device
def flops_mem(uops, ignore_indexing=False):

View File

@@ -1,8 +1,9 @@
import unittest
import unittest, sys
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes, Context, nn
from tinygrad.helpers import CI, Profiling, WINO
@unittest.skipIf(sys.platform.startswith("win"), "flaky on Windows")
class TestWinogradClose(unittest.TestCase):
def test_close(self):
inp = Tensor.rand(1, 16, 16, 16)
@@ -18,6 +19,7 @@ class TestWinogradClose(unittest.TestCase):
test = conv(inp).realize()
np.testing.assert_allclose(cmp.numpy(), test.numpy(), atol=1e-5)
@unittest.skipIf(sys.platform.startswith("win"), "flaky on Windows")
class TestWinograd(unittest.TestCase):
def setUp(self):
self.old = WINO.value