diff --git a/test/external/external_benchmark_resnet.py b/test/external/external_benchmark_resnet.py
index 426f2a4ffb..08fe56e2ce 100644
--- a/test/external/external_benchmark_resnet.py
+++ b/test/external/external_benchmark_resnet.py
@@ -2,10 +2,11 @@ import functools
 import time
 import unittest
 
-from tinygrad import Tensor, TinyJit, GlobalCounters
+from tinygrad import Tensor, TinyJit, GlobalCounters, Device
 from tinygrad.helpers import getenv, Context
 from tinygrad.nn.optim import SGD
 from tinygrad.nn.state import get_parameters
+from tinygrad.engine.realize import run_schedule
 
 from extra.models import resnet
 from examples.mlperf.initializers import Conv2dHeNormal, Linear
@@ -15,8 +16,9 @@ from examples.hlb_cifar10 import UnsyncedBatchNorm
 # benchmark speed: BEAM=2 JITCNT=10 DEFAULT_FLOAT=HALF python test/external/external_benchmark_resnet.py
 # benchmark only one layer: BEAM=2 DEFAULT_FLOAT=HALF python test/external/external_benchmark_resnet.py BenchmarkResnetTrain.test_layer1_2
 # inspect: DEBUG=2 BEAM=2 DEFAULT_FLOAT=HALF python test/external/external_benchmark_resnet.py
-# inspect convs: DEBUG=2 BEAM=2 CONV=1 DEFAULT_FLOAT=HALF python test/external/external_benchmark_resnet.py
-# inspect convs with batchnorm: DEBUG=2 BEAM=2 CONV=1 BN=1 DEFAULT_FLOAT=HALF python test/external/external_benchmark_resnet.py
+# inspect 1x1 convs: DEBUG=2 BEAM=2 CONV=1 DEFAULT_FLOAT=HALF python test/external/external_benchmark_resnet.py
+# inspect 3x3 convs: DEBUG=2 BEAM=2 CONV=2 DEFAULT_FLOAT=HALF python test/external/external_benchmark_resnet.py
+# inspect 3x3 convs with batchnorm: DEBUG=2 BEAM=2 CONV=2 BN=1 DEFAULT_FLOAT=HALF python test/external/external_benchmark_resnet.py
 # etc
 
 # use ASSIGN=0 to disable batchnorm/optimizer assigns
@@ -40,64 +42,86 @@ class BenchmarkResnetTrain(unittest.TestCase):
     layer = self.layers[layer_i][slice_i]
     xy = 112 >> layer_i
-    if layer_i > 0: xy >>= (1 if slice_i > 0 else 0)
+    xy >>= (1 if slice_i > 0 or layer_i == 0 else 0)  # layer 1 is preceded by maxpool2d
     name = f"layer{layer_i+1} slice{slice_i+1}"
 
-    # get specific conv (0 or 1)
+    # get specific conv
     if conv:
-      if bn: f = [layer.conv2, layer.bn2, Tensor.relu]
-      else: f = [layer.conv2, Tensor.relu]
-      cin = layer.conv2.in_channels
-      xy = xy // layer.conv1.stride
-      return f"{name} conv2 x{str((bs, cin, xy, xy)):20s} k{str(layer.conv2.weight.shape):20s}" + (" bn" if bn else ""), f, cin, xy
+      convs = [layer.conv1, layer.conv2, layer.conv3] + ([layer.downsample[0]] if layer.downsample else [])
+      bns = [layer.bn1, layer.bn2, layer.bn3] + ([layer.downsample[1]] if layer.downsample else [])
+      f = [convs[conv-1]]
+      if bn: f.append(bns[conv-1])
+      f.append(Tensor.relu)
+      cin = f[0].in_channels
+      if conv == 3: xy //= convs[1].stride
+      return f"{name} conv{conv} x{str((bs, cin, xy, xy)):20s} k{str(f[0].weight.shape):20s}" + (" bn" if bn else ""), f, cin, xy
 
     cin = layer.conv1.in_channels
     return f"{name} x{(bs, cin, xy, xy)}", [layer], cin, xy
 
   def _test_layer(self, name, layer, cin, xy):
     optim = SGD(get_parameters(layer), bs / 128 * 1.0)  # need sgd for some params but not consequential for benchmarking
-    with Context(SAVE_SCHEDULE=0): Tensor.realize(*[t.assign(t) for t in get_parameters(layer)])
+    with Context(SAVE_SCHEDULE=0): Tensor.realize(*[t.assign(t.detach().contiguous()) for t in get_parameters(optim)])
 
     JITCNT = getenv("JITCNT", 1)
    Tensor.training = True
 
     @TinyJit
     def step(x):
-      for _ in range(JITCNT):
-        optim.zero_grad()
-        x.grad = None
+      optim.zero_grad()
+      x.grad = None
 
-        y = x.sequential(layer).contiguous().contiguous_backward()
-        y.sum().backward()
-        if getenv("ASSIGN", 1): Tensor.realize(y, x.grad, *optim.schedule_step())
-        else: Tensor.realize(y, x.grad, *[t.grad for t in optim.params])
-      return y.detach()
+      y = x.sequential(layer).contiguous().contiguous_backward()
+      y.sum().backward()
+      if getenv("ASSIGN", 1): sched, _ = Tensor.schedule_with_vars(y, x.grad, *optim.schedule_step())
+      else: sched, _ = Tensor.schedule_with_vars(y, x.grad, *[t.grad for t in optim.params])
+
+      for _ in range(JITCNT):
+        run_schedule([si for si in sched])
 
     CNT = getenv("CNT", 5)
     best_tm = None
-    flops, mem_used, kernels = None, None, None
+    flops, mem_used, mem, kernels = None, None, None, None
     for i in range(CNT):
       with Context(SAVE_SCHEDULE=0): x = Tensor.randn(bs, cin, xy, xy, requires_grad=True).realize()
       GlobalCounters.reset()
 
       st = time.perf_counter()
-      out = step(x)
-      with Context(SAVE_SCHEDULE=0): out._data()
+      step(x)
+      Device[Device.DEFAULT].synchronize()
       et = time.perf_counter()
 
-      if flops is None: flops = GlobalCounters.global_ops / JITCNT
+      flops = GlobalCounters.global_ops / JITCNT
       mem_used = GlobalCounters.mem_used  # a little high with JITCNT > 1 fsr
-      kernels = GlobalCounters.kernel_count // JITCNT
+      mem = GlobalCounters.global_mem / JITCNT
+      if kernels is None: kernels = GlobalCounters.kernel_count // JITCNT
       tm = (et-st) / JITCNT
       if best_tm is None or tm < best_tm: best_tm = tm
-    print(f"\r{name:42s}: {best_tm * 1000:>9.2f} ms, {flops / 10**12 / tm:>7.2f} tflops, {mem_used / 10**9: 7.2f} GB used, {kernels:>6d} kernels")
+    print(f"\r{name:42s}: {best_tm * 1000:>9.2f} ms, {flops / 10**12 / best_tm:>7.2f} tflops, "
+          f"{mem_used / 10**9: 7.2f} GB used, {kernels:>6d} kernels")
+    return best_tm, flops, mem, kernels
+
+  def test_layer1_1(self): self._est(*self._test_layer(*self._get_layer(0, 0)), 1)
+  def test_layer1_2(self): self._est(*self._test_layer(*self._get_layer(0, 1)), 2)
+  def test_layer2_1(self): self._est(*self._test_layer(*self._get_layer(1, 0)), 1)
+  def test_layer2_2(self): self._est(*self._test_layer(*self._get_layer(1, 1)), 3)
+  def test_layer3_1(self): self._est(*self._test_layer(*self._get_layer(2, 0)), 1)
+  def test_layer3_2(self): self._est(*self._test_layer(*self._get_layer(2, 1)), 5)
+  def test_layer4_1(self): self._est(*self._test_layer(*self._get_layer(3, 0)), 1)
+  def test_layer4_2(self): self._est(*self._test_layer(*self._get_layer(3, 1)), 2)
+
+  est_tm, est_flops, est_mem, est_kernels = 0, 0, 0, 0
+
+  @classmethod
+  def _est(cls, tm, flops, mem, kernels, mult):
+    cls.est_tm += tm * mult
+    cls.est_flops += flops * mult
+    cls.est_mem += mem * mult
+    cls.est_kernels += kernels * mult
+
+  @classmethod
+  def tearDownClass(cls):
+    print(f"\restimated step tm: {cls.est_tm * 1000.0:.2f} ms, {cls.est_flops / 10**12 / cls.est_tm:.3f} tflops, "
+          f"{cls.est_mem / 10**9 / cls.est_tm:.2f} GB/s, {cls.est_kernels} kernels")
 
-  def test_layer1_1(self): self._test_layer(*self._get_layer(0, 0))
-  def test_layer1_2(self): self._test_layer(*self._get_layer(0, 1))
-  def test_layer2_1(self): self._test_layer(*self._get_layer(1, 0))
-  def test_layer2_2(self): self._test_layer(*self._get_layer(1, 1))
-  def test_layer3_1(self): self._test_layer(*self._get_layer(2, 0))
-  def test_layer3_2(self): self._test_layer(*self._get_layer(2, 1))
-  def test_layer4_1(self): self._test_layer(*self._get_layer(3, 0))
-  def test_layer4_2(self): self._test_layer(*self._get_layer(3, 1))
 
 if __name__ == '__main__': unittest.main()
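Note on the timing change: replacing `out._data()` with `Device[Device.DEFAULT].synchronize()` stops the clock when the device drains its queue, instead of also measuring a host readback. A minimal standalone sketch of the same pattern (hypothetical shapes, assuming a tinygrad default device):

```python
import time
from tinygrad import Tensor, Device

x = Tensor.randn(64, 64).realize()    # hypothetical input, realized up front
st = time.perf_counter()
(x @ x).realize()                     # enqueue the matmul kernel(s)
Device[Device.DEFAULT].synchronize()  # wait for the device, not for a readback
print(f"{(time.perf_counter() - st) * 1000:.2f} ms")
```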
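Note on the new `_est` weighting: the multiplier passed with each test is how often that block shape occurs in one full ResNet-50 step. Within each of the four layers, the first bottleneck block is unique (it strides and/or downsamples), while the remaining blocks are identical, so one measurement covers all of them. A small sketch of that arithmetic, assuming the standard ResNet-50 block counts:

```python
# ResNet-50 stacks [3, 4, 6, 3] bottleneck blocks across layers 1-4.
# The first block of each layer gets weight 1; the n-1 identical
# remaining blocks reuse a single measurement with weight n-1.
block_counts = [3, 4, 6, 3]
mults = []
for n in block_counts: mults += [1, n - 1]
print(mults)  # -> [1, 2, 1, 3, 1, 5, 1, 2], the weights passed to _est above
```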