#!/usr/bin/env python
import time
import cProfile
import pstats
import unittest
import torch
from tinygrad.tensor import Tensor
import pytest

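# file-wide markers so CI can deselect this slow speed test on the CUDA, GPU, and CLANG backends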
pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
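# profiling helpers: the timer ticks in integer nanoseconds, so with timeunit=1e-6 the
# "seconds" columns in the pstats output effectively read as milliseconds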
def start_profile():
  pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)
  pr.enable()
  return pr

def stop_profile(pr, sort='cumtime', frac=0.2):
  pr.disable()
  ps = pstats.Stats(pr)
  ps.strip_dirs()
  ps.sort_stats(sort)
  ps.print_stats(frac)

class TestConvSpeed(unittest.TestCase):

  def test_mnist(self):
    # https://keras.io/examples/vision/mnist_convnet/
    conv = 3
    inter_chan, out_chan = 32, 64

    # ****** torch baseline *******

    torch.backends.mkldnn.enabled = False

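    # 28x28 input -> 3x3 conv -> 26x26 -> 2x2 pool -> 13x13 -> 3x3 conv -> 11x11 -> 2x2 pool -> 5x5,
    # hence the out_chan*5*5 inputs to the final linear layer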
    c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
    c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
    l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)

    c2d = torch.nn.functional.conv2d
    mp = torch.nn.MaxPool2d((2,2))
    lsm = torch.nn.LogSoftmax(dim=1)

    cnt = 5
    fpt, bpt = 0.0, 0.0
    for i in range(cnt):
      et0 = time.time()
      x = torch.randn(128, 1, 28, 28, requires_grad=True)
      x = mp(c2d(x,c1).relu())
      x = mp(c2d(x,c2).relu())
      x = x.reshape(x.shape[0], -1)
      out = lsm(x.matmul(l1))
      out = out.mean()
      et1 = time.time()
      out.backward()
      et2 = time.time()
      fpt += (et1-et0)
      bpt += (et2-et1)

    fpt_baseline = (fpt*1000/cnt)
    bpt_baseline = (bpt*1000/cnt)
    print("torch forward pass: %.3f ms" % fpt_baseline)
    print("torch backward pass: %.3f ms" % bpt_baseline)

    # ****** tinygrad compare *******

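    # reuse the torch-initialized weights so both frameworks run the same network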
    c1 = Tensor(c1.detach().numpy(), requires_grad=True)
    c2 = Tensor(c2.detach().numpy(), requires_grad=True)
    l1 = Tensor(l1.detach().numpy(), requires_grad=True)

    cnt = 5
    fpt, bpt = 0.0, 0.0
    for i in range(1+cnt):
      et0 = time.time()
      x = Tensor.randn(128, 1, 28, 28)
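      # note: the first layer here uses avg_pool2d, while the torch baseline max-pools both layers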
      x = x.conv2d(c1).relu().avg_pool2d()
      x = x.conv2d(c2).relu().max_pool2d()
      x = x.reshape(shape=(x.shape[0], -1))
      out = x.dot(l1).log_softmax()
      out = out.mean()
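      # tinygrad is lazy: realize() forces the forward graph (and below, the gradients) to actually run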
      out.realize()
      et1 = time.time()
      out.backward()
      for p in [c1, c2, l1]: p.grad.realize()
      et2 = time.time()
      if i == 0:
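        # discard the first (warm-up) iteration's timing and profile the remaining cnt runs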
        pr = start_profile()
      else:
        fpt += (et1-et0)
        bpt += (et2-et1)

    stop_profile(pr, sort='time')
    fpt = (fpt*1000/cnt)
    bpt = (bpt*1000/cnt)
    print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
    print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))

if __name__ == '__main__':
  unittest.main()