batchnorm work

This commit is contained in:
George Hotz
2020-12-06 14:40:07 -08:00
parent da514c2918
commit 00312b8ad1
5 changed files with 32 additions and 19 deletions

View File

@@ -10,7 +10,7 @@ import time
import numpy as np
np.set_printoptions(suppress=True)
from tinygrad.tensor import Tensor
from tinygrad.utils import fetch
from tinygrad.utils import fetch, get_parameters
from extra.efficientnet import EfficientNet
def infer(model, img):
@@ -53,7 +53,9 @@ def infer(model, img):
if __name__ == "__main__":
# instantiate my net
model = EfficientNet(int(os.getenv("NUM", "0")))
model.load_weights_from_torch(GPU)
model.load_weights_from_torch()
if GPU:
[x.cuda_() for x in get_parameters(model)]
# category labels
import ast

View File

@@ -41,6 +41,8 @@ if __name__ == "__main__":
model = TinyConvNet(classes)
else:
model = EfficientNet(int(os.getenv("NUM", "0")), classes, has_se=False)
#model = EfficientNet(int(os.getenv("NUM", "0")), classes, has_se=True)
#model.load_weights_from_torch()
parameters = get_parameters(model)
print("parameters", len(parameters))
@@ -74,13 +76,14 @@ if __name__ == "__main__":
optimizer.step()
opt_time = (time.time()-st)*1000.0
#print(out.cpu().data)
st = time.time()
loss = loss.cpu().data
cat = np.argmax(out.cpu().data, axis=1)
accuracy = (cat == Y).mean()
finish_time = (time.time()-st)*1000.0
# printing
t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f -- %d" %
(loss, accuracy,

View File

@@ -180,8 +180,8 @@ class EfficientNet:
def forward(self, x):
x = x.pad2d(padding=(0,1,0,1))
x = self._bn0(x.conv2d(self._conv_stem, stride=2)).swish()
#print(x.shape, x.data[:, 0, 0, 0])
for block in self._blocks:
#print(x.shape)
x = block(x)
x = self._bn1(x.conv2d(self._conv_head)).swish()
x = x.avg_pool2d(kernel_size=x.shape[2:4])
@@ -189,7 +189,7 @@ class EfficientNet:
#x = x.dropout(0.2)
return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1,-1]))
def load_weights_from_torch(self, gpu):
def load_weights_from_torch(self):
# load b0
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/utils.py#L551
if self.number == 0:
@@ -223,7 +223,10 @@ class EfficientNet:
except AttributeError:
mv = eval(mk.replace(".bias", "_bias"))
vnp = v.numpy().astype(np.float32) if USE_TORCH else v
mv.data[:] = vnp if k != '_fc.weight' else vnp.T
if gpu:
mv.cuda_()
vnp = vnp if k != '_fc.weight' else vnp.T
if mv.shape == vnp.shape or vnp.shape == ():
mv.data[:] = vnp
else:
print("MISMATCH SHAPE IN %s, %r %r" % (k, mv.shape, vnp.shape))

View File

@@ -2,20 +2,25 @@ from tinygrad.tensor import Tensor
class BatchNorm2D:
def __init__(self, sz, eps=0.001):
self.eps = eps
self.eps = Tensor([eps], requires_grad=False)
self.two = Tensor([2], requires_grad=False)
self.weight = Tensor.ones(sz)
self.bias = Tensor.zeros(sz)
# TODO: need running_mean and running_var
self.running_mean = Tensor.zeros(sz)
self.running_var = Tensor.ones(sz)
self.running_mean = Tensor.zeros(sz, requires_grad=False)
self.running_var = Tensor.ones(sz, requires_grad=False)
self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)
def __call__(self, x):
# TODO: use tinyops for this
# mean op needs to support the axis argument before we can do this
#self.running_mean.data = x.data.mean(axis=(0,2,3))
#self.running_var.data = ((x - self.running_mean.reshape(shape=[1, -1, 1, 1]))**self.two).data.mean(axis=(0,2,3))
# does this work at inference?
x = x.sub(self.running_mean.reshape(shape=[1, -1, 1, 1]))
x = x.mul(self.weight.reshape(shape=[1, -1, 1, 1]))
x = x.div(self.running_var.add(Tensor([self.eps], gpu=x.gpu)).reshape(shape=[1, -1, 1, 1]).sqrt())
x = x.div(self.running_var.add(self.eps).reshape(shape=[1, -1, 1, 1]).sqrt())
x = x.add(self.bias.reshape(shape=[1, -1, 1, 1]))
return x

View File

@@ -195,23 +195,23 @@ class Tensor:
# ***** non first class ops *****
def mean(self):
div = Tensor(np.array([1/np.prod(self.shape)], dtype=self.dtype), gpu=self.gpu)
div = Tensor(np.array([1/np.prod(self.shape)], dtype=self.dtype), gpu=self.gpu, requires_grad=False)
return self.sum().mul(div)
def sqrt(self):
root = Tensor(np.zeros(self.shape, dtype=self.dtype)+0.5, gpu=self.gpu)
root = Tensor(np.zeros(self.shape, dtype=self.dtype)+0.5, gpu=self.gpu, requires_grad=False)
return self.pow(root)
def div(self, y):
root = Tensor(np.zeros(self.shape, dtype=self.dtype)-1, gpu=self.gpu)
root = Tensor(np.zeros(self.shape, dtype=self.dtype)-1, gpu=self.gpu, requires_grad=False)
return self.mul(y.pow(root))
def swish(self):
return self.mul(self.sigmoid())
def tanh(self):
t2 = Tensor(np.zeros(self.shape, dtype=self.dtype)+2, gpu=self.gpu)
t1 = Tensor(np.zeros(self.shape, dtype=self.dtype)+1, gpu=self.gpu)
t2 = Tensor(np.zeros(self.shape, dtype=self.dtype)+2, gpu=self.gpu, requires_grad=False)
t1 = Tensor(np.zeros(self.shape, dtype=self.dtype)+1, gpu=self.gpu, requires_grad=False)
return self.mul(t2).sigmoid().mul(t2) - t1 # 2*sigmoid(2*x)-1
# An instantiation of the Function is the Context
@@ -251,7 +251,7 @@ def register(name, fxn, gpu=False):
f.cl_ctx, f.cl_queue = cl_ctx, cl_queue
return f.apply(f, *x, **kwargs)
setattr(Tensor, name, dispatch)
if name in ['add', 'sub', 'mul', 'div']:
if name in ['add', 'sub', 'mul', 'div', 'pow']:
setattr(Tensor, "__%s__" % name, dispatch)
setattr(Tensor, "__i%s__" % name, lambda self,x: self.assign(dispatch(self,x)))