move things, clean up extra (#2292)

* move things

* idk why pylint needs that now

* delete unused
Author: George Hotz
Date: 2023-11-13 20:18:40 -08:00
Committed by: GitHub
Parent: b1f7f29525
Commit: 0cbf6c1811
57 changed files with 48 additions and 401 deletions

@@ -39,7 +39,7 @@ repos:
pass_filenames: false
- id: pylint
name: pylint
-entry: python3 -m pylint tinygrad/
+entry: env PYTHONPATH="." python3 -m pylint tinygrad/
language: system
always_run: true
pass_filenames: false
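On the pylint change: the commit message itself says "idk why pylint needs that now", but prefixing the repo root onto PYTHONPATH is the usual fix when python3 -m pylint stops resolving the package being linted after a file move. Reconstructed from the hunk above, the hook presumably now reads:

- id: pylint
  name: pylint
  entry: env PYTHONPATH="." python3 -m pylint tinygrad/
  language: system
  always_run: true
  pass_filenames: false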

1 cache/.gitignore vendored
@@ -1 +0,0 @@
*

@@ -2,7 +2,7 @@
import gc
import time
from tqdm import trange
-from models.efficientnet import EfficientNet
+from extra.models.efficientnet import EfficientNet
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.tensor import Tensor
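The hunks that follow repeat this exact one-line substitution: every import of the old top-level models package becomes extra.models. A representative before/after pair, illustrative and mirroring the lines shown in these diffs:

from models.efficientnet import EfficientNet        # before: models/ at the repo root
from extra.models.efficientnet import EfficientNet  # after: models/ moved under extra/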

@@ -1,5 +1,5 @@
from pathlib import Path
-from models.efficientnet import EfficientNet
+from extra.models.efficientnet import EfficientNet
from tinygrad.tensor import Tensor
from tinygrad.nn.state import safe_save
from extra.utils import fetch

@@ -13,7 +13,7 @@ from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv
from extra.utils import fetch
from tinygrad.jit import TinyJit
-from models.efficientnet import EfficientNet
+from extra.models.efficientnet import EfficientNet
np.set_printoptions(suppress=True)
# TODO: you should be able to put these in the jitted function

@@ -1,5 +1,5 @@
from typing import List
-from models.resnet import ResNet50
+from extra.models.resnet import ResNet50
from tinygrad.tensor import Tensor
from tinygrad.ops import LoadOps, Device, Compiled
from tinygrad.codegen.linearizer import Linearizer

@@ -1,6 +1,6 @@
-from models.mask_rcnn import MaskRCNN
-from models.resnet import ResNet
-from models.mask_rcnn import BoxList
+from extra.models.mask_rcnn import MaskRCNN
+from extra.models.resnet import ResNet
+from extra.models.mask_rcnn import BoxList
from torch.nn import functional as F
from torchvision import transforms as T
from torchvision.transforms import functional as Ft

@@ -9,7 +9,7 @@ from examples.mlperf import helpers
def eval_resnet():
# Resnet50-v1.5
from tinygrad.jit import TinyJit
-from models.resnet import ResNet50
+from extra.models.resnet import ResNet50
mdl = ResNet50()
mdl.load_from_pretrained()
@@ -53,7 +53,7 @@ def eval_resnet():
def eval_unet3d():
# UNet3D
-from models.unet3d import UNet3D
+from extra.models.unet3d import UNet3D
from extra.datasets.kits19 import iterate, sliding_window_inference
from examples.mlperf.metrics import get_dice_score
mdl = UNet3D()
@@ -71,8 +71,8 @@ def eval_unet3d():
def eval_retinanet():
# RetinaNet with ResNeXt50_32X4D
-from models.resnet import ResNeXt50_32X4D
-from models.retinanet import RetinaNet
+from extra.models.resnet import ResNeXt50_32X4D
+from extra.models.retinanet import RetinaNet
mdl = RetinaNet(ResNeXt50_32X4D())
mdl.load_from_pretrained()
@@ -129,7 +129,7 @@ def eval_retinanet():
def eval_rnnt():
# RNN-T
-from models.rnnt import RNNT
+from extra.models.rnnt import RNNT
mdl = RNNT()
mdl.load_from_pretrained()
@@ -158,7 +158,7 @@ def eval_rnnt():
def eval_bert():
# Bert-QA
-from models.bert import BertForQuestionAnswering
+from extra.models.bert import BertForQuestionAnswering
mdl = BertForQuestionAnswering()
mdl.load_from_pretrained()
@@ -194,8 +194,8 @@ def eval_bert():
def eval_mrcnn():
from tqdm import tqdm
-from models.mask_rcnn import MaskRCNN
-from models.resnet import ResNet
+from extra.models.mask_rcnn import MaskRCNN
+from extra.models.resnet import ResNet
from extra.datasets.coco import BASEDIR, images, convert_prediction_to_coco_bbox, convert_prediction_to_coco_mask, accumulate_predictions_for_coco, evaluate_predictions_on_coco, iterate
from examples.mask_rcnn import compute_prediction_batched, Image
mdl = MaskRCNN(ResNet(50, num_classes=None, stride_in_1x1=True))

@@ -12,29 +12,29 @@ def test_model(model, *inputs):
def spec_resnet():
# Resnet50-v1.5
-from models.resnet import ResNet50
+from extra.models.resnet import ResNet50
mdl = ResNet50()
img = Tensor.randn(1, 3, 224, 224)
test_model(mdl, img)
def spec_retinanet():
# Retinanet with ResNet backbone
-from models.resnet import ResNet50
-from models.retinanet import RetinaNet
+from extra.models.resnet import ResNet50
+from extra.models.retinanet import RetinaNet
mdl = RetinaNet(ResNet50(), num_classes=91, num_anchors=9)
img = Tensor.randn(1, 3, 224, 224)
test_model(mdl, img)
def spec_unet3d():
# 3D UNET
-from models.unet3d import UNet3D
+from extra.models.unet3d import UNet3D
mdl = UNet3D()
#mdl.load_from_pretrained()
img = Tensor.randn(1, 1, 128, 128, 128)
test_model(mdl, img)
def spec_rnnt():
-from models.rnnt import RNNT
+from extra.models.rnnt import RNNT
mdl = RNNT()
#mdl.load_from_pretrained()
x = Tensor.randn(220, 1, 240)
@@ -42,7 +42,7 @@ def spec_rnnt():
test_model(mdl, x, y)
def spec_bert():
-from models.bert import BertForQuestionAnswering
+from extra.models.bert import BertForQuestionAnswering
mdl = BertForQuestionAnswering()
#mdl.load_from_pretrained()
x = Tensor.randn(1, 384)
@@ -51,7 +51,7 @@ def spec_bert():
test_model(mdl, x, am, tt)
def spec_mrcnn():
-from models.mask_rcnn import MaskRCNN, ResNet
+from extra.models.mask_rcnn import MaskRCNN, ResNet
mdl = MaskRCNN(ResNet(50, num_classes=None, stride_in_1x1=True))
#mdl.load_from_pretrained()
x = Tensor.randn(3, 224, 224)

@@ -8,7 +8,7 @@ from tinygrad.nn import optim
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor
from extra.datasets import fetch_cifar
-from models.efficientnet import EfficientNet
+from extra.models.efficientnet import EfficientNet
class TinyConvNet:
def __init__(self, classes=10):

@@ -6,7 +6,7 @@ from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import getenv
from extra.training import train, evaluate
-from models.resnet import ResNet
+from extra.models.resnet import ResNet
from extra.datasets import fetch_mnist

@@ -5,7 +5,7 @@ import random
from tinygrad.nn.state import get_parameters
from tinygrad.nn.optim import Adam
from extra.training import train, evaluate
-from models.transformer import Transformer
+from extra.models.transformer import Transformer
# dataset idea from https://github.com/karpathy/minGPT/blob/master/projects/adder/adder.py
def make_dataset():

@@ -4,7 +4,7 @@ import numpy as np
from PIL import Image
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv
-from models.vit import ViT
+from extra.models.vit import ViT
from extra.utils import fetch
"""
fn = "gs://vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz"

@@ -1,86 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <thread>
#include <chrono>
//#define FN "/dev/nvme0n1"
#define FN "../../weights/LLaMA/7B/consolidated.00.pth"
#define SZ (unsigned long long)(512*1024*1024)
#define CNT 10LL
void test_read() {
#ifdef O_DIRECT
int f = open(FN, O_RDONLY|O_DIRECT);
#else
int f = open(FN, O_RDONLY);
//fcntl(f, F_NOCACHE, 1);
#endif
printf("open %d\n", f);
/*void *buf = malloc(CNT*SZ);
printf("malloc %p\n", buf);
mlock(buf, CNT*SZ);*/
auto t0 = std::chrono::high_resolution_clock::now();
void *buf = mmap(NULL, SZ*CNT, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
auto t1 = std::chrono::high_resolution_clock::now();
mlock(buf, CNT*SZ);
for (int i = 0; i < CNT; i++) {
read(f, (unsigned char*)buf+SZ*i, SZ);
}
auto t2 = std::chrono::high_resolution_clock::now();
//free(buf);
printf("malloc %p\n", buf);
float ns = (float)std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
float pns = (float)std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
printf("read %.2f GB in %.2f s (%.2f s to prepare), %.2f GB/s\n", SZ/1e9*CNT, ns*1e-9, pns*1e-9, (SZ*CNT)/ns);
close(f);
munmap(buf, SZ*CNT);
}
void test_mmap() {
#ifdef O_DIRECT
int f = open(FN, O_RDONLY|O_DIRECT);
#else
int f = open(FN, O_RDONLY);
#endif
printf("open %d\n", f);
void *dat = mmap(NULL, SZ*CNT, PROT_READ, MAP_PRIVATE, f, 0);
auto t1 = std::chrono::high_resolution_clock::now();
mlock(dat, SZ*CNT);
auto t2 = std::chrono::high_resolution_clock::now();
printf("mmap %p\n", dat);
float ns = (float)std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
printf("read %.2f GB in %.2f s, %.2f GB/s\n", SZ/1e9*CNT, ns*1e-9, (SZ*CNT)/ns);
close(f);
munlock(dat, SZ*CNT);
munmap(dat, SZ*CNT);
}
int main() {
//system("sync; echo 1 > /proc/sys/vm/drop_caches");
//system("sudo purge");
//test_mmap();
//system("sync; echo 1 > /proc/sys/vm/drop_caches");
system("sudo purge");
test_read();
test_read();
}
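A note on the deleted benchmark's throughput math: SZ*CNT is a byte count and the elapsed time is taken in nanoseconds, so bytes divided by nanoseconds is numerically GB/s. A quick sanity check in Python using the file's constants and a hypothetical one-second read time (not a measured result):

SZ = 512 * 1024 * 1024   # 512 MiB per read, matching the #define above
CNT = 10                 # ten sequential reads
total_bytes = SZ * CNT   # 5368709120 bytes, about 5.37 GB
ns = 1e9                 # assume the read loop took exactly one second
print(f"{total_bytes / 1e9:.2f} GB in {ns * 1e-9:.2f} s, {total_bytes / ns:.2f} GB/s")
# prints: 5.37 GB in 1.00 s, 5.37 GB/s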

@@ -1 +0,0 @@
a.out

@@ -1,26 +0,0 @@
import sys
import numpy as np
from typing import Dict, Type
from tinygrad.codegen.ast import ASTKernel
from tinygrad.runtime.ops_cpu import CPUBuffer
from tinygrad.ops import DeviceBuffer, map_buffers
in_test = False
test_cnt = 0
def test_ast(k:ASTKernel, device:Type[DeviceBuffer]=CPUBuffer):
global in_test, test_cnt
if in_test: return
in_test = True
print("testing AST", test_cnt)
test_cnt += 1
# TODO: this should only copy the base buffer and retain the shapetracker (requires CPU shapetracker implementation)
cpubufs : Dict[DeviceBuffer, DeviceBuffer] = {x:device.fromCPU(x.toCPU()) for x in k.bufs}
real_out = cpubufs[k.bufs[0]].toCPU()
assert hasattr(device, 'exec_ast')
test_out = device.exec_ast(map_buffers(cpubufs, k.ast)).toCPU()
if not np.allclose(real_out, test_out, atol=1e-4, rtol=1e-4):
print("MISMATCH")
print(k.print())
sys.tracebacklimit = 0
np.testing.assert_allclose(real_out, test_out)
in_test = False
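For reference, the deleted helper implemented a small differential test: copy a kernel's buffers to a reference device, execute the same AST there, and assert the outputs agree within tolerance. A condensed sketch of that pattern (a paraphrase of the code above, with hypothetical run_under_test/run_reference callables, not a drop-in replacement):

import numpy as np

def cross_check(run_under_test, run_reference, atol=1e-4, rtol=1e-4):
    # run the same computation on the backend under test and on a trusted
    # reference backend, then compare elementwise, as the deleted test_ast did
    out = run_under_test()
    ref = run_reference()
    np.testing.assert_allclose(out, ref, atol=atol, rtol=rtol)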

@@ -8,8 +8,8 @@ from tinygrad.tensor import Tensor
from tinygrad.helpers import dtypes
from extra.utils import get_child, download_file
from tinygrad.nn.state import torch_load
-from models.resnet import ResNet
-from models.retinanet import nms as _box_nms
+from extra.models.resnet import ResNet
+from extra.models.retinanet import nms as _box_nms
USE_NP_GATHER = os.getenv('FULL_TINYGRAD', '0') == '0'

@@ -1,7 +1,7 @@
import math
from tinygrad.helpers import flatten
import tinygrad.nn as nn
-from models.resnet import ResNet
+from extra.models.resnet import ResNet
from extra.utils import get_child
import numpy as np
@@ -231,7 +231,7 @@ class FPN:
return results
if __name__ == "__main__":
-from models.resnet import ResNeXt50_32X4D
+from extra.models.resnet import ResNeXt50_32X4D
backbone = ResNeXt50_32X4D()
retina = RetinaNet(backbone)
retina.load_from_pretrained()

@@ -1,6 +1,6 @@
import numpy as np
from tinygrad.tensor import Tensor
-from models.transformer import TransformerBlock
+from extra.models.transformer import TransformerBlock
class ViT:
def __init__(self, layers=12, embed_dim=192, num_heads=3):

@@ -1,30 +0,0 @@
import torch
import triton
import triton.language as tl
from triton.compiler import compile
from triton.runtime import JITFunction
def program(b0, b1, b2):
idx = tl.program_id(0)
x = tl.load(b1 + idx)
y = tl.load(b2 + idx)
tl.store(b0 + idx, x+y)
program_jit = JITFunction(program)
# JITFunction(__main__:program) {'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32'}, 'device': 0, 'constants': {}, 'num_warps': 4, 'num_stages': 3, 'extern_libs': None, 'configs': (instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=()),)}
# ast -> ttir -> ttgir -> llir -> ptx -> cubin
compiled = compile(program_jit, signature={0: '*fp32', 1: '*fp32', 2: '*fp32'})
print(compiled.asm['ast'])
print(compiled.asm['ttir'])
#print(compiled.asm['ttgir'])
print(eval(compiled.asm['llir']).decode('utf-8'))
#print(compiled.asm['ptx'])
print("running")
size = 4
x = torch.ones(size, device='cuda')
y = torch.ones(size, device='cuda')
output = torch.empty_like(x)
out = compiled[(output.numel(),1,1)](output, x, y)
print(output)

@@ -25,7 +25,6 @@ exclude = [
"docs/",
"examples/",
"extra/",
"models/",
"openpilot/",
]
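Dropping "models/" here is consistent with the move: that directory now lives under extra/, which the list still excludes. Assuming no other edits to this block, the exclude entry presumably now reads:

exclude = [
    "docs/",
    "examples/",
    "extra/",
    "openpilot/",
]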

@@ -1,208 +0,0 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.ops import LazyOp, ReduceOps, BinaryOps, UnaryOps, MovementOps
from tinygrad.shape.shapetracker import ShapeTracker, View, ZeroView
from tinygrad.runtime.ops_gpu import GPUBuffer, CLProgram, CLCodegen
#from tinygrad.runtime.ops_metal import MetalBuffer as GPUBuffer, MetalProgram as CLProgram, MetalCodegen as CLCodegen
from tinygrad.helpers import getenv
from extra.lib_test_ast import test_ast
import platform
OSX = platform.system() == "Darwin"
def compile_and_test_ast(ast, local_size=None):
k = CLCodegen(ast)
prg = k.codegen().build(CLProgram)
if local_size is not None: prg.local_size = local_size
for i in range(5): prg(prg.lower(k.bufs))
if getenv("TEST", 0): test_ast(k)
class TestAST(unittest.TestCase):
def test_conv_zeroview_ast(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (2, 2, 2, 1), -3), ZeroView((1, 1, 1, 2), ((0, 1), (0, 1), (-1, 2), (-1, 3))), View((1, 1, 3, 4), (0, 0, 4, 1), 0)]))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (0, 0, 0, 0), 0)]))
op1 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
ast = LazyOp(UnaryOps.RELU, (op1,), None)
compile_and_test_ast(ast)
def test_cifar_conv(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 64, 34, 34), (65536, 1024, 32, 1), -33), ZeroView((512, 64, 32, 32), ((0, 512), (0, 64), (-1, 33), (-1, 33))), View((512, 1, 128, 32, 32, 64, 3, 3), (73984, 73984, 0, 34, 1, 1156, 34, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 64, 32, 32), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 1, 128, 32, 32, 64, 3, 3), (0, 0, 576, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 64, 3, 3), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
ast = LazyOp(ReduceOps.SUM, (op0,), (512, 1, 128, 32, 32, 1, 1, 1))
compile_and_test_ast(ast)
def test_cifar_conv_backward(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 512, 10, 10), (64, 16384, 8, 1), -9), ZeroView((256, 512, 8, 8), ((0, 256), (0, 512), (-1, 9), (-1, 9))), View((256, 1, 512, 3, 3, 512, 8, 8), (51200, 51200, 0, 10, 1, 100, 10, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 256, 8, 8), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 1, 512, 3, 3, 512, 8, 8), (0, 0, 64, 0, 0, 32768, 8, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 512, 8, 8), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (256, 1, 512, 3, 3, 1, 1, 1))
ast = LazyOp(MovementOps.RESHAPE, (op1,), (256, 512, 3, 3))
compile_and_test_ast(ast)
def test_first_op_conv(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 130, 258, 1, 12), (393216, 3072, 12, 12, 1), -3084), ZeroView((1, 128, 256, 1, 12), ((0, 1), (-1, 129), (-1, 257), (0, 1), (0, 12))), View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 6192, 24, 0, 0, 3096, 12, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 768, 4), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 0, 0, 432, 4, 144, 16, 48, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 108, 4), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
op3 = LazyOp(UnaryOps.RELU, (op2,), None)
buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
op4 = LazyOp(UnaryOps.EXP, (op2,), None)
op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
op6 = LazyOp(UnaryOps.RELU, (op5,), None)
op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
compile_and_test_ast(ast)
def test_second_op_conv(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 66, 130, 32, 1), (262144, 4096, 32, 1, 1), -4128), ZeroView((1, 64, 128, 32, 1), ((0, 1), (-1, 65), (-1, 129), (0, 32), (0, 1))), View((1, 64, 128, 8, 4, 1, 1, 3, 3), (266240, 4160, 32, 4, 1, 12480, 12480, 4160, 32), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 64, 128, 8, 4, 1, 1, 3, 3), (0, 0, 0, 36, 1, 0, 0, 12, 4), 0)]), hostbuf=GPUBuffer(shape=(8, 9, 4), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
op3 = LazyOp(UnaryOps.RELU, (op2,), None)
buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
op4 = LazyOp(UnaryOps.EXP, (op2,), None)
op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
op6 = LazyOp(UnaryOps.RELU, (op5,), None)
op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
compile_and_test_ast(ast)
def test_third_op_conv(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 4096, 32, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 0, 0, 128, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(4, 32, 4), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 4, 4, 1, 1, 1, 1))
buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 4, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(16,), force_create=True))
op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
ast = LazyOp(MovementOps.RESHAPE, (op2,), (64, 512, 4))
compile_and_test_ast(ast)
# VALIDHACKS=1 IMAGE=2 DEBUG=4 PYTHONPATH="." GPU=1 OPT=2 python3 test/external_test_gpu_ast.py TestAST.test_reduce_op
# 164 time 27.75 ms running re_S128_4 with [128] None count 4 runtime 1016.06 us 2.07 GFLOPS () -> (128, 1)
# 169 time 22.51 ms running matmul with [4, 16, 128] [4, 16, 16] count 5 runtime 110.08 us 19.06 GFLOPS ('-DMATMUL',) -> (128, 1)
def test_reduce_op(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 128, 4, 1, 1, 1, 1))
buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 128, 4, 1, 1, 1, 1), (512, 512, 512, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
op3 = LazyOp(UnaryOps.RELU, (op2,), None)
ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
compile_and_test_ast(ast)
def test_alt_reduce_op(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1))
buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
compile_and_test_ast(ast)
# re_S32_16_36_6 is fast
def test_1x1_36_6(self): # 36 <- 6
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 1536, 24, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 0, 0, 0, 0, 96, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(36, 24, 4), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1))
buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(144,), force_create=True))
op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
op4 = LazyOp(UnaryOps.EXP, (op2,), None)
op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
op6 = LazyOp(BinaryOps.MAX, (op5,buf5,), None)
op7 = LazyOp(BinaryOps.SUB, (op3,op6,), None)
ast = LazyOp(MovementOps.RESHAPE, (op7,), (32, 2304, 4))
compile_and_test_ast(ast, None if OSX else (16, 16, 4))
# re_S32_16_6_36 is slow
def test_1x1_6_36(self): # 6 <- 36
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 9216, 144, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 2304, 4), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 0, 0, 0, 0, 576, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 144, 4), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 1536, 24, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
op3 = LazyOp(BinaryOps.ADD, (op2,buf3,), None)
ast = LazyOp(MovementOps.RESHAPE, (op3,), (32, 384, 4))
compile_and_test_ast(ast, (6, 16, 4))
# re_S32_16_6_24
def test_1x1_6_24(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 6144, 96, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 1536, 4), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 0, 0, 0, 0, 384, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 96, 4), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
#buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
#op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
ast = LazyOp(MovementOps.RESHAPE, (op1,), (32, 384, 4))
compile_and_test_ast(ast, (6, 4, 8))
def test_full_reduce_op(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (512, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
op0 = LazyOp(BinaryOps.ADD, (buf0,buf1,), None)
buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([2.], dtype=np.float32)))
op1 = LazyOp(BinaryOps.POW, (op0,buf2,), None)
op2 = LazyOp(ReduceOps.SUM, (op1,), (1, 1))
buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([0.5], dtype=np.float32)))
op3 = LazyOp(BinaryOps.POW, (op2,buf3,), None)
buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
op4 = LazyOp(BinaryOps.SUB, (op3,buf4,), None)
op5 = LazyOp(UnaryOps.RELU, (op4,), None)
buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
op6 = LazyOp(BinaryOps.ADD, (op5,buf5,), None)
buf6 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([3.4e+38], dtype=np.float32)))
op7 = LazyOp(BinaryOps.SUB, (op3,buf6,), None)
op8 = LazyOp(UnaryOps.RELU, (op7,), None)
op9 = LazyOp(BinaryOps.SUB, (op6,op8,), None)
op10 = LazyOp(UnaryOps.RECIPROCAL, (op9,), None)
ast = LazyOp(MovementOps.RESHAPE, (op10,), (1, 1))
compile_and_test_ast(ast)
def test_1239_reduce(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 64, 4), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 1024, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(1239, 256, 4), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1239, 4, 1, 1, 1, 1))
ast = LazyOp(MovementOps.RESHAPE, (op1,), (1, 1, 1, 1, 4956))
compile_and_test_ast(ast)
def test_enet_first_conv_bs32(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 3, 225, 225), (150528, 50176, 224, 1), 0), ZeroView((8, 3, 224, 224), ((0, 8), (0, 3), (0, 225), (0, 225))), View((8, 1, 32, 112, 112, 3, 3, 3), (151875, 151875, 0, 450, 2, 50625, 225, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 3, 224, 224), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 1, 32, 112, 112, 3, 3, 3), (0, 0, 27, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 3, 3), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (8, 1, 32, 112, 112, 1, 1, 1))
ast = LazyOp(MovementOps.RESHAPE, (op1,), (8, 32, 112, 112))
compile_and_test_ast(ast)
def test_enet_reduce_bs32(self):
buf0 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 32, 225, 225), (50176, 150528, 224, 1), 0), ZeroView((3, 32, 224, 224), ((0, 3), (0, 32), (0, 225), (0, 225))), View((3, 1, 32, 3, 3, 32, 112, 112), (1620000, 1620000, 0, 225, 1, 50625, 450, 2), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 224, 224), force_create=True))
buf1 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 1, 32, 3, 3, 32, 112, 112), (0, 12845056, 401408, 0, 0, 12544, 112, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 32, 1, 1, 32, 112, 112), force_create=True))
op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (3, 1, 32, 3, 3, 1, 1, 1))
ast = LazyOp(MovementOps.RESHAPE, (op1,), (3, 32, 3, 3))
compile_and_test_ast(ast)
if __name__ == '__main__':
unittest.main()

@@ -33,10 +33,10 @@ class CLCache:
if self.allowed is not None:
assert len(cache) <= self.allowed and (not self.strict or len(cache) == self.allowed), f"used too many kernels! {len(cache)} > {self.allowed}"
-from models.convnext import ConvNeXt
-from models.efficientnet import EfficientNet
-from models.resnet import ResNet18
-from models.vit import ViT
+from extra.models.convnext import ConvNeXt
+from extra.models.efficientnet import EfficientNet
+from extra.models.resnet import ResNet18
+from extra.models.vit import ViT
from tinygrad.nn.state import get_parameters
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")

@@ -23,7 +23,7 @@ def set_equal_weights(mdl, torch_mdl):
class TestBert(unittest.TestCase):
def test_questions(self):
-from models.bert import BertForQuestionAnswering
+from extra.models.bert import BertForQuestionAnswering
from transformers import BertForQuestionAnswering as TorchBertForQuestionAnswering
from transformers import BertConfig

@@ -8,9 +8,9 @@ from PIL import Image
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor
-from models.efficientnet import EfficientNet
-from models.vit import ViT
-from models.resnet import ResNet50
+from extra.models.efficientnet import EfficientNet
+from extra.models.vit import ViT
+from extra.models.resnet import ResNet50
def _load_labels():
labels_filename = pathlib.Path(__file__).parent / 'efficientnet/imagenet1000_clsidx_to_labels.txt'

@@ -2,7 +2,7 @@
import unittest
import numpy as np
from tinygrad.tensor import Tensor
-from models.rnnt import LSTM
+from extra.models.rnnt import LSTM
import torch
class TestRNNT(unittest.TestCase):

@@ -6,11 +6,11 @@ from tinygrad.nn import optim
from tinygrad.tensor import Device
from tinygrad.helpers import getenv
from extra.training import train
-from models.convnext import ConvNeXt
-from models.efficientnet import EfficientNet
-from models.transformer import Transformer
-from models.vit import ViT
-from models.resnet import ResNet18
+from extra.models.convnext import ConvNeXt
+from extra.models.efficientnet import EfficientNet
+from extra.models.transformer import Transformer
+from extra.models.vit import ViT
+from extra.models.resnet import ResNet18
import pytest
pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

@@ -303,7 +303,7 @@ class TestSchedule(unittest.TestCase):
check_schedule(x, 3)
def test_resnet_block(self):
-from models.resnet import BasicBlock
+from extra.models.resnet import BasicBlock
Tensor.training = False
bb = BasicBlock(64,64)

@@ -70,7 +70,7 @@ class TestSafetensors(unittest.TestCase):
for k,v in tensors.items(): np.testing.assert_array_equal(ret2[k].numpy(), v.numpy())
def test_efficientnet_safetensors(self):
-from models.efficientnet import EfficientNet
+from extra.models.efficientnet import EfficientNet
model = EfficientNet(0)
state_dict = get_state_dict(model)
safe_save(state_dict, temp("eff0"))