fix strided convs, GPU env var for enet

This commit is contained in:
George Hotz
2020-11-07 10:26:37 -08:00
parent ec03eb44bd
commit fbff6ab2e5
5 changed files with 20 additions and 10 deletions

View File

@@ -2,6 +2,8 @@
# https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth
# a rough copy of
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
import os
GPU = os.getenv("GPU", None) is not None
import sys
import io
import numpy as np
@@ -114,6 +116,8 @@ class EfficientNet:
mv = eval(mk.replace(".bias", "_bias"))
vnp = v.numpy().astype(np.float32)
mv.data[:] = vnp if k != '_fc.weight' else vnp.T
if GPU:
mv.cuda_()
if __name__ == "__main__":
# instantiate my net
@@ -154,7 +158,10 @@ if __name__ == "__main__":
# run the net
import time
st = time.time()
out = model.forward(Tensor(img))
if GPU:
out = model.forward(Tensor(img).cuda())
else:
out = model.forward(Tensor(img))
# if you want to look at the outputs
"""

View File

@@ -77,10 +77,10 @@ class TestOps(unittest.TestCase):
H,W = 3,3
helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(),
lambda x,w: Tensor.conv2d(x,w,stride=2).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
lambda x,w: Tensor.conv2d(x,w,stride=2).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu, forward_only=self.gpu)
helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
lambda x,w: torch.nn.functional.conv2d(x,w,stride=(2,1)).relu(),
lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu, forward_only=self.gpu)
def test_maxpool2x2(self):
helper_test_op([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, (2,2)), Tensor.max_pool2d, gpu=self.gpu)

View File

@@ -12,7 +12,7 @@ class BatchNorm2D:
# TODO: need running_mean and running_var
self.running_mean = Tensor.zeros(sz)
self.running_var = Tensor.zeros(sz)
self.num_batches_tracked = Tensor.zeros(0)
self.num_batches_tracked = Tensor.zeros(1)
def __call__(self, x):
# does this work at inference?

View File

@@ -324,11 +324,13 @@ class Conv2D(Function):
prg = clbuild(ctx.cl_ctx, """
__kernel void conv(__global const float *input, __global const float *weight, __global float *output,
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix) {
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) {
int B = get_global_id(0); // range 0-bs
int Y = get_global_id(1); // range 0-oy
int X = get_global_id(2); // range 0-ox
int IY = Y*ys;
int IX = X*xs;
// input = (bs, groups, cin, iy, ix)
// weight = (groups, rcout, cin, H, W)
@@ -337,10 +339,10 @@ class Conv2D(Function):
for (int c = 0; c < rcout; c++) {
float acc = 0.0;
for (int ci = 0; ci < cin; ci++) {
for (int y = Y; y < Y+H; y++) {
for (int x = X; x < X+W; x++) {
for (int y = IY; y < IY+H; y++) {
for (int x = IX; x < IX+W; x++) {
acc += input[B*groups*cin*iy*ix + g*cin*iy*ix + ci*iy*ix + y*ix + x] * \
weight[g*rcout*cin*H*W + c*cin*H*W + ci*H*W + (y-Y)*W + (x-X)];
weight[g*rcout*cin*H*W + c*cin*H*W + ci*H*W + (y-IY)*W + (x-IX)];
}
}
}
@@ -355,7 +357,8 @@ class Conv2D(Function):
np.int32(H), np.int32(W),
np.int32(groups), np.int32(rcout), np.int32(cin),
np.int32(oy), np.int32(ox),
np.int32(iy), np.int32(ix)
np.int32(iy), np.int32(ix),
np.int32(ys), np.int32(xs)
)
return ret

View File

@@ -123,7 +123,7 @@ class Tensor:
if not self.gpu:
require_init_gpu()
assert self.data.dtype == np.float32 # only float32 on GPU
data = cl.Buffer(cl_ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=self.data)
data = cl.Buffer(cl_ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=self.data.ravel())
data.shape = self.shape
data.dtype = self.data.dtype
ret = Tensor(data)