mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-02-10 22:54:59 -05:00
* new uops is an actual graph
* it's way slower
* simpler
* fix define acc
* render_loop unique
* ops test pass
* add pattern matcher back, there's bugs
* rewrite
* use priority queue
* recursive children
* fix tests
* fix tests with SINK
* fix abstractions
* fix assembly
* simpler
* link define_acc
* fix DEFINE_ACC placement
* type verify
* full cmp
* fix cmp
* ACCESS_ACC
* insert DEFINE_ACC
* fix PHI
* recursive rewrite
* fix many tests
* sum collapse
* more patterns
* correct change
* fold arange
* fix that lin test
* space
* big folding rule works
* close
* has more maxes, meh
* cached node replace
* set changed
* simplest folding yet
* works
* works
* DIV
* all tests pass
* del
* fuzz linearizer fails
* sum_collapse
* test depth 2 cf
* fix lin test 14
* fix clang depth
* disable that
* failure 14 is fixed
* fix ptx
* failure 27 is fixed
* fix llama
* run_cnt
* Revert "Optimize PTX gated loads index calculation (#4304)"
This reverts commit d97d5a7689.
* fix uops loop
* fix ptx bugs
* add barrier
* print
* mem_type in ptx direct
* bypass tests that fail in CI but pass locally
* ptx remove ptr_ar
* more ptx passing
* fix ptx tests
* assert compile support
* remove model inference benchmark from red
72 lines
2.5 KiB
Python
import unittest
from tinygrad import Tensor, GlobalCounters
from tinygrad.helpers import Timing, CI, Profiling, WINO, DEBUG
from tinygrad.ops import LoadOps
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.engine.schedule import create_schedule

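# TestWinograd forces the winograd conv path (WINO=1) for every test and restores the previous setting afterwards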
class TestWinograd(unittest.TestCase):
  def setUp(self):
    self.old = WINO.value
    WINO.value = 1
  def tearDown(self):
    WINO.value = self.old

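  # schedule and linearize a small winograd conv, bounding shapetracker and view counts to catch regressions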
  def test_speed(self):
    x = Tensor.empty(1,4,9,9)
    w = Tensor.empty(4,4,3,3)

    with Timing("running conv: "):
      out = Tensor.conv2d(x, w)

    with Timing("scheduling: "):
      sched = create_schedule([out.lazydata])

    for i,s in enumerate(sched):
      if s.ast[0].op in LoadOps: continue
      ops = [out.lazyops for out in s.ast]
      with Timing(f"linearize {i} with {len(ops):4d} ops: "):
        l = Linearizer(*s.ast)
        l.hand_coded_optimizations()
        l.linearize()
      assert len(l.sts) <= 256 # just the current value to prevent regression
      if DEBUG >= 2: print(f"{len(l.sts):4d} shapetrackers with max {max(len(x.views) for x in l.sts)} views")
      for st in l.sts:
        assert len(st.views) <= 2, "too many views in winograd"
        if DEBUG >= 3:
          print(f"{len(st.views):3d} views")
          for v in st.views: print(v)

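  # profile a single winograd conv end to end (profiling output is disabled under CI)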
  def test_profile(self):
    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
    with Profiling(enabled=not CI, sort='time'):
      out = Tensor.conv2d(x,w).realize()
    out.numpy()

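  # a single winograd conv is expected to schedule exactly four kernels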
  def test_four_kernels(self):
    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
    GlobalCounters.reset()
    out = Tensor.conv2d(x,w).realize()
    assert GlobalCounters.kernel_count == 4
    out.numpy()

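  # compare op and memory counters of the winograd path against the default conv path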
  def test_counters(self):
    IC, OC, X, Y = 4,4,9,9
    #OC, IC, X, Y = 512, 256, 8, 8
    x,w = Tensor.rand(1,IC,Y,X).realize(), Tensor.rand(OC,IC,3,3).realize()
    GlobalCounters.reset()
    Tensor.conv2d(x,w).realize()
    ops_wino, mem_wino = GlobalCounters.global_ops, GlobalCounters.global_mem
    WINO.value = 0
    GlobalCounters.reset()
    Tensor.conv2d(x,w).realize()
    ops_normal, mem_normal = GlobalCounters.global_ops, GlobalCounters.global_mem

    ops_ratio, mem_ratio = ops_wino/ops_normal, mem_wino/mem_normal
    print(f"ops: normal {ops_normal:9d} wino {ops_wino:9d} ratio {ops_ratio:.2f}")
    print(f"mem: normal {mem_normal:9d} wino {mem_wino:9d} ratio {mem_ratio:.2f}")
    assert ops_ratio < 2 and mem_ratio < 10

if __name__ == '__main__':
  unittest.main(verbosity=2)