Files
tinygrad/test/test_net_speed.py
cheeetoo a0965ee198 CI < 5 minutes (#1252)
* models matrix

* fix typo and install gpu deps

* install llvm deps if needed

* fix

* testops with cuda

* remove pip cache since not work

* cuda env

* install cuda deps

* maybe it will work now

* i can't read

* all tests in matrix

* trim down more

* opencl stuff in matrix

* opencl pip cache

* test split

* change cuda test exclusion

* test

* fix cuda maybe

* add models

* add more n=auto

* third thing

* fix bug

* cache pip more

* change name

* update tests

* try again cause why not

* balance

* try again...

* try apt cache for cuda

* try on gpu:

* try cuda again

* update packages step

* replace libz-dev with zlib1g-dev

* only cache cuda

* why error

* fix gpuocelot bug

* apt cache err

* apt cache too slow?

* opt and image in single runner

* add a couple n=autos

* remove test matrix

* try cuda apt cache again

* libz-dev -> zlib1g-dev

* remove -s since not supported by xdist

* the cache takes too long and doesn't work

* combine webgpu and metal tests

* combine imagenet to c and cpu tests

* torch tests with linters

* torch back by itself

* small windows clang test with torch tests

* fix a goofy windows bug

* im dumb

* bro

* clang with linters

* fix pylint error

* linter not work on windows

* try with clang again

* clang and imagenet?

* install deps

* fix

* fix quote

* clang by itself (windows too slow)

* env vars for imagenet

* cache pip for metal and webgpu tests

* try torch with metal and webgpu

* doesn't work, too long

* remove -v

* try -n=logical

* don't use logical

* revert accidental thing

* remove some prints unless CI

* fix print unless CI

* ignore speed tests for slow tests

* clang windows in matrix (ubuntu being tested in imagenet->c test)

* try manual pip cache

* fix windows pip cache path

* all manual pip cache

* fix pip cache dir for macos

* print_ci function in helpers

* CI as variable, no print_ci

* missed one

* cuda tests with docker image

* remove setup-python action for cuda

* python->python3?

* remove -s -v

* try fix pip cache

* maybe fix

* try to fix pip cache

* is this the path?

* maybe cache pip

* try again

* create wheels dir

* ?

* cuda pip deps in dockerfile

* disable pip cache for clang

* image from ghcr instead of docker hub

* why is clang like this

* fast deps

* try use different caches

* remove the fast thing

* try with lighter image

* remove setup python for cuda

* small docker and cuda fast deps

* ignore a few more tests

* cool docker thing (maybe)

* oops

* quotes

* fix docker command

* fix bug

* ignore train efficientnet test

* remove dockerfile (docker stuff takes too long)

* remove docker stuff and normal cuda

* oops

* ignore the tests for cuda

* does this work

* ignore test_train on slow backends

* add space

* llvm ignore same tests as cuda

* nvm

* ignore lr scheduler tests

* get some stats

* fix ignore bug

* remove extra '

* remove and

* ignore test for llvm

* change ignored tests and duration on all backends

* fix

* and -> or

* ignore some more cuda tests

* finally?

* does this fix it

* remove durations=0

* add some more tests to llvm

* make last pytest more readable

* fix

* don't train efficientnet on cpu

* try w/out pip cache

* pip cache seems to be generally better

* pytest file markers

* try apt fast for cuda

* use quick install for apt-fast

* apt-fast not worth

* apt-get to apt

* fix typo

* suppress warnings

* register markers

* disable debug on fuzz tests

* change marker names

* apt update and apt install in one command

* update marker names in test.yml

* webgpu pytest marker
2023-07-23 13:00:56 -07:00

103 lines
2.8 KiB
Python

#!/usr/bin/env python
import time
import cProfile
import pstats
import unittest
import torch
from tinygrad.tensor import Tensor, Device
import pytest
# Module-level pytest markers: pytest applies every marker listed in `pytestmark` to all tests in this file.
# NOTE(review): presumably CI deselects these markers when running the CUDA, GPU and CLANG backends -- confirm against the pytest invocation.
pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
def start_profile():
    """Create a ``cProfile.Profile``, enable it, and return it.

    The profiler is driven by ``time.perf_counter_ns``, a monotonic clock that
    returns integer nanoseconds. The previous timer, ``int(time.time() * 1e9)``,
    pushed a ~1e18 value through a float (losing sub-microsecond resolution)
    and followed the wall clock, which can jump.

    ``timeunit=1e-6`` is kept as before: each nanosecond tick is scaled by
    1e-6, so the figures ``pstats`` prints are 1000x real seconds, i.e. they
    read as milliseconds.

    Returns:
        The enabled ``cProfile.Profile``; the caller must disable it
        (for example by calling ``disable()`` on it).
    """
    pr = cProfile.Profile(timer=time.perf_counter_ns, timeunit=1e-6)
    pr.enable()
    return pr
def stop_profile(pr, sort='cumtime', frac=0.2):
    """Disable profiler *pr* and print a report of what it collected.

    Args:
        pr: the profiler object to stop; it is disabled and then handed to
            ``pstats.Stats``.
        sort: sort key forwarded to ``Stats.sort_stats``.
        frac: restriction forwarded to ``Stats.print_stats``.

    Returns:
        None. The report goes to the stream ``pstats.Stats`` writes to by
        default.
    """
    pr.disable()
    # Each Stats method returns the Stats object itself, so the directory
    # stripping, sorting and printing steps are expressed as a single chain.
    pstats.Stats(pr).strip_dirs().sort_stats(sort).print_stats(frac)
class TestConvSpeed(unittest.TestCase):
    """Speed comparison of a small MNIST-style convnet: torch baseline vs. tinygrad."""

    def test_mnist(self):
        """Time forward and backward passes in torch and in tinygrad and print both.

        The same two-conv + linear network is run ``cnt`` times in each
        framework on random input of shape (128, 1, 28, 28). Average forward
        and backward times are printed in milliseconds, with the tinygrad
        figures also shown as a multiple of the torch baseline. The method makes
        no assertions; it only reports timings.

        The tinygrad loop runs one extra, untimed first iteration; the profiler
        is started only after it, so the profile covers just the timed passes.
        """
        # https://keras.io/examples/vision/mnist_convnet/
        conv = 3
        inter_chan, out_chan = 32, 64
        cnt = 5  # timed iterations per framework

        # ****** torch baseline *******

        # Set process-wide and never restored, so it stays off for the tinygrad part too.
        torch.backends.mkldnn.enabled = False

        c1 = torch.randn(inter_chan, 1, conv, conv, requires_grad=True)
        c2 = torch.randn(out_chan, inter_chan, conv, conv, requires_grad=True)
        # 28 -> conv3 -> 26 -> pool -> 13 -> conv3 -> 11 -> pool -> 5
        l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)

        c2d = torch.nn.functional.conv2d
        mp = torch.nn.MaxPool2d((2, 2))
        lsm = torch.nn.LogSoftmax(dim=1)

        fpt, bpt = 0.0, 0.0
        for _ in range(cnt):
            # perf_counter is monotonic and high resolution, unlike time.time().
            et0 = time.perf_counter()
            x = torch.randn(128, 1, 28, 28, requires_grad=True)
            x = mp(c2d(x, c1).relu())
            x = mp(c2d(x, c2).relu())
            x = x.reshape(x.shape[0], -1)
            out = lsm(x.matmul(l1))
            out = out.mean()
            et1 = time.perf_counter()
            out.backward()
            et2 = time.perf_counter()
            fpt += et1 - et0
            bpt += et2 - et1

        fpt_baseline = fpt * 1000 / cnt
        bpt_baseline = bpt * 1000 / cnt
        print("torch forward pass: %.3f ms" % fpt_baseline)
        print("torch backward pass: %.3f ms" % bpt_baseline)

        # ****** tinygrad compare *******

        # Reuse the torch weights so both frameworks run the same parameters.
        c1 = Tensor(c1.detach().numpy(), requires_grad=True)
        c2 = Tensor(c2.detach().numpy(), requires_grad=True)
        l1 = Tensor(l1.detach().numpy(), requires_grad=True)

        fpt, bpt = 0.0, 0.0
        pr = None
        try:
            for i in range(1 + cnt):
                et0 = time.perf_counter()
                x = Tensor.randn(128, 1, 28, 28)
                # Max pooling after both convs, matching the torch baseline and
                # the referenced Keras model. The first stage used avg_pool2d,
                # so the two timings did not cover the same network.
                x = x.conv2d(c1).relu().max_pool2d()
                x = x.conv2d(c2).relu().max_pool2d()
                x = x.reshape(shape=(x.shape[0], -1))
                out = x.dot(l1).log_softmax()
                out = out.mean()
                out.realize()
                et1 = time.perf_counter()
                out.backward()
                for weight in (c1, c2, l1):
                    weight.grad.realize()
                et2 = time.perf_counter()
                if i == 0:
                    # The first iteration is untimed; profiling starts after it.
                    pr = start_profile()
                else:
                    fpt += et1 - et0
                    bpt += et2 - et1
        finally:
            # Always stop the profiler, even if an iteration raised, so it
            # does not stay enabled for the rest of the test process.
            if pr is not None:
                stop_profile(pr, sort='time')

        fpt = fpt * 1000 / cnt
        bpt = bpt * 1000 / cnt
        print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
        print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))
# Run the tests with unittest's own runner when this file is executed directly.
if __name__ == '__main__':
    unittest.main()