fix strided convs in the GPU conv kernel, add GPU env var for the EfficientNet example

2026-04-07 03:00:26 -04:00 · 2020-11-07 10:26:37 -08:00
parent ec03eb44bd
commit fbff6ab2e5
5 changed files with 20 additions and 10 deletions
--- a/examples/efficientnet.py
+++ b/examples/efficientnet.py
@@ -2,6 +2,8 @@
 # https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth
 # a rough copy of
 # https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
+import os
+GPU = os.getenv("GPU", None) is not None
 import sys
 import io
 import numpy as np
@@ -114,6 +116,8 @@ class EfficientNet:
          mv = eval(mk.replace(".bias", "_bias"))
      vnp = v.numpy().astype(np.float32)
      mv.data[:] = vnp if k != '_fc.weight' else vnp.T
+      if GPU:
+        mv.cuda_()

 if __name__ == "__main__":
  # instantiate my net
@@ -154,7 +158,10 @@ if __name__ == "__main__":
  # run the net
  import time
  st = time.time()
-  out = model.forward(Tensor(img))
+  if GPU:
+    out = model.forward(Tensor(img).cuda())
+  else:
+    out = model.forward(Tensor(img))

  # if you want to look at the outputs
  """
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -77,10 +77,10 @@ class TestOps(unittest.TestCase):
    H,W = 3,3
    helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
      lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(),
-      lambda x,w: Tensor.conv2d(x,w,stride=2).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
+      lambda x,w: Tensor.conv2d(x,w,stride=2).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu, forward_only=self.gpu)
    helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
      lambda x,w: torch.nn.functional.conv2d(x,w,stride=(2,1)).relu(),
-      lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
+      lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu, forward_only=self.gpu)

  def test_maxpool2x2(self):
    helper_test_op([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, (2,2)), Tensor.max_pool2d, gpu=self.gpu)
--- a/tinygrad/nn.py
+++ b/tinygrad/nn.py
@@ -12,7 +12,7 @@ class BatchNorm2D:
    # TODO: need running_mean and running_var
    self.running_mean = Tensor.zeros(sz)
    self.running_var = Tensor.zeros(sz)
-    self.num_batches_tracked = Tensor.zeros(0)
+    self.num_batches_tracked = Tensor.zeros(1)

  def __call__(self, x):
    # this work at inference?
--- a/tinygrad/opsgpu.py
+++ b/tinygrad/opsgpu.py
@@ -324,11 +324,13 @@ class Conv2D(Function):

    prg = clbuild(ctx.cl_ctx, """
    __kernel void conv(__global const float *input, __global const float *weight, __global float *output,
-      int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix) {
+      int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) {

      int B = get_global_id(0);  // range 0-bs
      int Y = get_global_id(1);  // range 0-oy
      int X = get_global_id(2);  // range 0-ox
+      int IY = Y*ys;
+      int IX = X*xs;
      
      // input  = (bs, groups, cin, iy, ix)
      // weight = (groups, rcout, cin, H, W)
@@ -337,10 +339,10 @@ class Conv2D(Function):
        for (int c = 0; c < rcout; c++) {
          float acc = 0.0;
          for (int ci = 0; ci < cin; ci++) {
-            for (int y = Y; y < Y+H; y++) {
-              for (int x = X; x < X+W; x++) {
+            for (int y = IY; y < IY+H; y++) {
+              for (int x = IX; x < IX+W; x++) {
                acc += input[B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + y*ix + x] * \
-                  weight[g*rcout*cin*H*W + c*cin*H*W + ci*H*W + (y-Y)*W + (x-X)];
+                  weight[g*rcout*cin*H*W + c*cin*H*W + ci*H*W + (y-IY)*W + (x-IX)];
              }
            }
          }
@@ -355,7 +357,8 @@ class Conv2D(Function):
      np.int32(H), np.int32(W),
      np.int32(groups), np.int32(rcout), np.int32(cin),
      np.int32(oy), np.int32(ox), 
-      np.int32(iy), np.int32(ix)
+      np.int32(iy), np.int32(ix),
+      np.int32(ys), np.int32(xs)
    )
    return ret

--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -123,7 +123,7 @@ class Tensor:
    if not self.gpu:
      require_init_gpu()
      assert self.data.dtype == np.float32   # only float32 on GPU
-      data = cl.Buffer(cl_ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=self.data)
+      data = cl.Buffer(cl_ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=self.data.ravel())
      data.shape = self.shape
      data.dtype = self.data.dtype
      ret = Tensor(data)