Add openpilot tests

Author: FrogAi
Date: 2024-03-06 14:58:47 -07:00
parent 2901597132
commit b39097a12d
259 changed files with 31176 additions and 12 deletions

Binary file added (not shown): 108 KiB

Binary file added (not shown): 7.9 KiB

File diff suppressed because it is too large.


@@ -0,0 +1,57 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
import torch

def get_question_samp(bsz, seq_len, vocab_size, seed):
  np.random.seed(seed)
  in_ids = np.random.randint(vocab_size, size=(bsz, seq_len))
  mask = np.random.choice([True, False], size=(bsz, seq_len))
  seg_ids = np.random.randint(1, size=(bsz, seq_len))
  return in_ids, mask, seg_ids

def set_equal_weights(mdl, torch_mdl):
  from tinygrad.nn.state import get_state_dict
  state, torch_state = get_state_dict(mdl), torch_mdl.state_dict()
  assert len(state) == len(torch_state)
  for k, v in state.items():
    assert k in torch_state
    torch_state[k].copy_(torch.from_numpy(v.numpy()))
  torch_mdl.eval()

class TestBert(unittest.TestCase):
  def test_questions(self):
    from models.bert import BertForQuestionAnswering
    from transformers import BertForQuestionAnswering as TorchBertForQuestionAnswering
    from transformers import BertConfig

    # small config so the test runs quickly
    config = {
      'vocab_size':24, 'hidden_size':2, 'num_hidden_layers':2, 'num_attention_heads':2,
      'intermediate_size':32, 'hidden_dropout_prob':0.1, 'attention_probs_dropout_prob':0.1,
      'max_position_embeddings':512, 'type_vocab_size':2
    }

    # Create in tinygrad
    Tensor.manual_seed(1337)
    mdl = BertForQuestionAnswering(**config)

    # Create in torch
    with torch.no_grad():
      torch_mdl = TorchBertForQuestionAnswering(BertConfig(**config))

    set_equal_weights(mdl, torch_mdl)

    # run both models on the same random inputs and compare outputs
    seeds = (1337, 3141)
    bsz, seq_len = 1, 16
    for _, seed in enumerate(seeds):
      in_ids, mask, seg_ids = get_question_samp(bsz, seq_len, config['vocab_size'], seed)
      out = mdl(Tensor(in_ids), Tensor(mask), Tensor(seg_ids))
      torch_out = torch_mdl.forward(torch.from_numpy(in_ids).long(), torch.from_numpy(mask), torch.from_numpy(seg_ids).long())[:2]
      torch_out = torch.cat(torch_out).unsqueeze(2)
      np.testing.assert_allclose(out.numpy(), torch_out.detach().numpy(), atol=5e-4, rtol=5e-4)

if __name__ == '__main__':
  unittest.main()


@@ -0,0 +1,115 @@
import ast
import pathlib
import sys
import unittest
import numpy as np
from PIL import Image
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor
from models.efficientnet import EfficientNet
from models.vit import ViT
from models.resnet import ResNet50

def _load_labels():
  labels_filename = pathlib.Path(__file__).parent / 'efficientnet/imagenet1000_clsidx_to_labels.txt'
  return ast.literal_eval(labels_filename.read_text())

_LABELS = _load_labels()

def preprocess(img, new=False):
  # preprocess image
  aspect_ratio = img.size[0] / img.size[1]
  img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))

  img = np.array(img)
  y0, x0 = (np.asarray(img.shape)[:2] - 224) // 2
  img = img[y0: y0 + 224, x0: x0 + 224]

  # low level preprocess
  if new:
    img = img.astype(np.float32)
    img -= [127.0, 127.0, 127.0]
    img /= [128.0, 128.0, 128.0]
    img = img[None]
  else:
    img = np.moveaxis(img, [2, 0, 1], [0, 1, 2])
    img = img.astype(np.float32)[:3].reshape(1, 3, 224, 224)
    img /= 255.0
    img -= np.array([0.485, 0.456, 0.406]).reshape((1, -1, 1, 1))
    img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))
  return img

def _infer(model: EfficientNet, img, bs=1):
  Tensor.training = False
  img = preprocess(img)
  # run the net
  if bs > 1: img = img.repeat(bs, axis=0)
  out = model.forward(Tensor(img)).cpu()
  return _LABELS[np.argmax(out.numpy()[0])]

chicken_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/Chicken.jpg')
car_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/car.jpg')

class TestEfficientNet(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    cls.model = EfficientNet(number=getenv("NUM"))
    cls.model.load_from_pretrained()

  @classmethod
  def tearDownClass(cls):
    del cls.model

  def test_chicken(self):
    label = _infer(self.model, chicken_img)
    self.assertEqual(label, "hen")

  def test_chicken_bigbatch(self):
    label = _infer(self.model, chicken_img, 2)
    self.assertEqual(label, "hen")

  def test_car(self):
    label = _infer(self.model, car_img)
    self.assertEqual(label, "sports car, sport car")

class TestViT(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    cls.model = ViT()
    cls.model.load_from_pretrained()

  @classmethod
  def tearDownClass(cls):
    del cls.model

  def test_chicken(self):
    label = _infer(self.model, chicken_img)
    self.assertEqual(label, "cock")

  def test_car(self):
    label = _infer(self.model, car_img)
    self.assertEqual(label, "racer, race car, racing car")

class TestResNet(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    cls.model = ResNet50()
    cls.model.load_from_pretrained()

  @classmethod
  def tearDownClass(cls):
    del cls.model

  def test_chicken(self):
    label = _infer(self.model, chicken_img)
    self.assertEqual(label, "hen")

  def test_car(self):
    label = _infer(self.model, car_img)
    self.assertEqual(label, "sports car, sport car")

if __name__ == '__main__':
  unittest.main()


@@ -0,0 +1,165 @@
import torch
from torch import nn
import unittest
import numpy as np
from tinygrad.nn.state import get_parameters, get_state_dict
from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
from tinygrad.tensor import Tensor
from extra.datasets import fetch_mnist
from tinygrad.helpers import CI

def compare_tiny_torch(model, model_torch, X, Y):
  with Tensor.train():
    model_torch.train()
    model_state_dict = get_state_dict(model)
    for k,v in model_torch.named_parameters():
      if not CI: print(f"initting {k} from torch")
      model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()

    optimizer = optim.SGD(get_parameters(model), lr=0.001)
    optimizer_torch = torch.optim.SGD(model_torch.parameters(), lr=0.001)

    Xt = torch.Tensor(X.numpy())
    np.testing.assert_allclose(X.numpy(), Xt.detach().numpy())

    out = model(X)
    loss = (out * Y).mean()
    if not CI: print(loss.realize().numpy())

    out_torch = model_torch(torch.Tensor(X.numpy()))
    loss_torch = (out_torch * torch.Tensor(Y.numpy())).mean()
    if not CI: print(loss_torch.detach().numpy())

    # assert losses match
    np.testing.assert_allclose(loss.realize().numpy(), loss_torch.detach().numpy(), atol=1e-4)

    # zero and backward
    optimizer.zero_grad()
    loss.backward()
    optimizer_torch.zero_grad()
    loss_torch.backward()

    for k,v in list(model_torch.named_parameters())[::-1]:
      g = model_state_dict[k].grad.numpy()
      gt = v.grad.detach().numpy()
      if not CI: print("testing grads", k)
      np.testing.assert_allclose(g, gt, atol=1e-3, err_msg=f'grad mismatch {k}')

    # take the steps
    optimizer.step()
    optimizer_torch.step()

    # assert weights match (they don't!)
    for k,v in model_torch.named_parameters():
      if not CI: print("testing weight", k)
      np.testing.assert_allclose(model_state_dict[k].numpy(), v.detach().numpy(), atol=1e-3, err_msg=f'weight mismatch {k}')

def get_mnist_data():
  X_train, Y_train, X_test, Y_test = fetch_mnist()
  BS = 32
  num_classes = 10
  X = Tensor(X_test[0:BS].astype(np.float32))
  Y = np.zeros((BS, num_classes), np.float32)
  Y[range(BS),Y_test[0:BS]] = -1.0*num_classes
  return X, Tensor(Y)

class TestEnd2End(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    cls.X, cls.Y = get_mnist_data()

  def setUp(self):
    torch.manual_seed(123)

  def test_linear_mnist(self):
    class LinTiny:
      def __init__(self, has_batchnorm=False):
        self.l1 = Linear(784, 128)
        self.l2 = Linear(128, 10)
        self.bn1 = BatchNorm2d(128) if has_batchnorm else lambda x: x
      def __call__(self, x):
        return self.l2(self.l1(x)).relu().log_softmax(-1)
    class LinTorch(nn.Module):
      def __init__(self, has_batchnorm=False):
        super().__init__()
        self.l1 = nn.Linear(784, 128)
        self.l2 = nn.Linear(128, 10)
      def forward(self, x):
        return self.l2(self.l1(x)).relu().log_softmax(-1)
    compare_tiny_torch(LinTiny(), LinTorch(), self.X, self.Y)

  def test_bn_mnist(self):
    class LinTiny:
      def __init__(self):
        self.l1 = Linear(784, 128)
        self.l2 = Linear(128, 10)
        self.bn1 = BatchNorm2d(128)
      def __call__(self, x):
        return self.l2(self.bn1(self.l1(x).reshape(x.shape[0], -1, 1, 1)).reshape(x.shape[0], -1).relu()).log_softmax(-1)
    class LinTorch(nn.Module):
      def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(784, 128)
        self.l2 = nn.Linear(128, 10)
        self.bn1 = nn.BatchNorm2d(128)
      def forward(self, x):
        return self.l2(self.bn1(self.l1(x).reshape(x.shape[0], -1, 1, 1)).reshape(x.shape[0], -1).relu()).log_softmax(-1)
    compare_tiny_torch(LinTiny(), LinTorch(), self.X, self.Y)

  def test_bn_alone(self):
    np.random.seed(1337)
    X = Tensor(np.random.randn(32, 10, 1, 1).astype(np.float32))
    Y = Tensor(np.random.randn(32, 10, 1, 1).astype(np.float32))
    compare_tiny_torch(BatchNorm2d(10), nn.BatchNorm2d(10), X, Y)

  def test_bn_linear(self):
    BS, K = 2, 1
    eps = 0
    X = Tensor([1,0]).reshape(BS, K, 1, 1)
    Y = Tensor([-1,0]).reshape(BS, K, 1, 1)
    class LinTiny:
      def __init__(self):
        self.l1 = Conv2d(K, K, 1, bias=False)
        self.bn1 = BatchNorm2d(K, affine=False, track_running_stats=False, eps=eps)
      def __call__(self, x): return self.bn1(self.l1(x))
    class LinTorch(nn.Module):
      def __init__(self):
        super().__init__()
        self.l1 = nn.Conv2d(K, K, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(K, affine=False, track_running_stats=False, eps=eps)
      def forward(self, x): return self.bn1(self.l1(x))
    model_torch = LinTorch()
    with torch.no_grad():
      model_torch.l1.weight[:] = 1.
    compare_tiny_torch(LinTiny(), model_torch, X, Y)

  def test_conv_mnist(self):
    class LinTiny:
      def __init__(self, has_batchnorm=False):
        self.c1 = Conv2d(1, 8, 3, stride=2)
        self.c2 = Conv2d(8, 16, 3, stride=2)
        self.l1 = Linear(16*6*6, 10)
        if has_batchnorm:
          self.bn1, self.bn2 = BatchNorm2d(8), BatchNorm2d(16)
        else:
          self.bn1, self.bn2 = lambda x: x, lambda x: x
      def __call__(self, x):
        return self.l1(self.bn2(self.c2(self.bn1(self.c1(x)).relu())).relu().reshape(x.shape[0], -1)).log_softmax(-1)
    class LinTorch(nn.Module):
      def __init__(self, has_batchnorm=False):
        super().__init__()
        self.c1 = nn.Conv2d(1, 8, 3, stride=2)
        self.c2 = nn.Conv2d(8, 16, 3, stride=2)
        self.l1 = nn.Linear(16*6*6, 10)
        if has_batchnorm:
          self.bn1, self.bn2 = nn.BatchNorm2d(8), nn.BatchNorm2d(16)
        else:
          self.bn1, self.bn2 = lambda x: x, lambda x: x
      def forward(self, x):
        return self.l1(self.bn2(self.c2(self.bn1(self.c1(x)).relu())).relu().reshape(x.shape[0], -1)).log_softmax(-1)
    for has_batchnorm in [False, True]:
      with self.subTest(has_batchnorm=has_batchnorm):
        compare_tiny_torch(LinTiny(has_batchnorm), LinTorch(has_batchnorm), self.X.reshape((-1, 1, 28, 28)), self.Y)

if __name__ == "__main__":
  unittest.main()


@@ -0,0 +1,116 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import optim, BatchNorm2d
from extra.training import train, evaluate
from extra.datasets import fetch_mnist
import pytest

pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

# load the mnist dataset
X_train, Y_train, X_test, Y_test = fetch_mnist()

# create a model
class TinyBobNet:
  def __init__(self):
    self.l1 = Tensor.scaled_uniform(784, 128)
    self.l2 = Tensor.scaled_uniform(128, 10)

  def parameters(self):
    return get_parameters(self)

  def forward(self, x):
    return x.dot(self.l1).relu().dot(self.l2).log_softmax()

# create a model with a conv layer
class TinyConvNet:
  def __init__(self, has_batchnorm=False):
    # https://keras.io/examples/vision/mnist_convnet/
    conv = 3
    #inter_chan, out_chan = 32, 64
    inter_chan, out_chan = 8, 16 # for speed
    self.c1 = Tensor.scaled_uniform(inter_chan,1,conv,conv)
    self.c2 = Tensor.scaled_uniform(out_chan,inter_chan,conv,conv)
    self.l1 = Tensor.scaled_uniform(out_chan*5*5, 10)
    if has_batchnorm:
      self.bn1 = BatchNorm2d(inter_chan)
      self.bn2 = BatchNorm2d(out_chan)
    else:
      self.bn1, self.bn2 = lambda x: x, lambda x: x

  def parameters(self):
    return get_parameters(self)

  def forward(self, x:Tensor):
    x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
    x = self.bn1(x.conv2d(self.c1)).relu().max_pool2d()
    x = self.bn2(x.conv2d(self.c2)).relu().max_pool2d()
    x = x.reshape(shape=[x.shape[0], -1])
    return x.dot(self.l1).log_softmax()

class TestMNIST(unittest.TestCase):
  def test_sgd_onestep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=1)
    for p in model.parameters(): p.realize()

  def test_sgd_threestep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=3)

  def test_sgd_sixstep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=6, noloss=True)

  def test_adam_onestep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=1)
    for p in model.parameters(): p.realize()

  def test_adam_threestep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=3)

  def test_conv_onestep(self):
    np.random.seed(1337)
    model = TinyConvNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=1, noloss=True)
    for p in model.parameters(): p.realize()

  def test_conv(self):
    np.random.seed(1337)
    model = TinyConvNet()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, steps=100)
    assert evaluate(model, X_test, Y_test) > 0.93 # torch gets 0.9415 sometimes

  def test_conv_with_bn(self):
    np.random.seed(1337)
    model = TinyConvNet(has_batchnorm=True)
    optimizer = optim.AdamW(model.parameters(), lr=0.003)
    train(model, X_train, Y_train, optimizer, steps=200)
    assert evaluate(model, X_test, Y_test) > 0.94

  def test_sgd(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, steps=600)
    assert evaluate(model, X_test, Y_test) > 0.94 # CPU gets 0.9494 sometimes

if __name__ == '__main__':
  unittest.main()


@@ -0,0 +1,143 @@
#!/usr/bin/env python
import os
import time
import io
import unittest
import numpy as np
import onnx
from extra.utils import fetch, temp
from extra.onnx import get_run_onnx
from tinygrad.tensor import Tensor
from tinygrad.helpers import CI
import pytest

pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

def run_onnx_torch(onnx_model, inputs):
  import torch
  from onnx2torch import convert
  torch_model = convert(onnx_model).float()
  with torch.no_grad():
    torch_out = torch_model(*[torch.tensor(x) for x in inputs.values()])
  return torch_out

OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx"

np.random.seed(1337)

class TestOnnxModel(unittest.TestCase):
  def test_benchmark_openpilot_model(self):
    dat = fetch(OPENPILOT_MODEL)
    onnx_model = onnx.load(io.BytesIO(dat))
    run_onnx = get_run_onnx(onnx_model)

    def get_inputs():
      np_inputs = {
        "input_imgs": np.random.randn(*(1, 12, 128, 256)),
        "big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
        "desire": np.zeros((1, 100, 8)),
        "traffic_convention": np.array([[1., 0.]]),
        "nav_features": np.zeros((1, 256)),
        "features_buffer": np.zeros((1, 99, 128)),
      }
      inputs = {k:Tensor(v.astype(np.float32), requires_grad=False) for k,v in np_inputs.items()}
      return inputs

    for _ in range(7):
      inputs = get_inputs()
      st = time.monotonic()
      tinygrad_out = run_onnx(inputs)['outputs']
      mt = time.monotonic()
      tinygrad_out.realize()
      mt2 = time.monotonic()
      tinygrad_out = tinygrad_out.numpy()
      et = time.monotonic()
      if not CI: print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")

    if not CI:
      import cProfile
      import pstats
      inputs = get_inputs()
      pr = cProfile.Profile(timer=time.perf_counter_ns, timeunit=1e-6)
      pr.enable()

    tinygrad_out = run_onnx(inputs)['outputs']
    tinygrad_out.realize()
    tinygrad_out = tinygrad_out.numpy()

    if not CI:
      pr.disable()
      stats = pstats.Stats(pr)
      stats.dump_stats(temp("net.prof"))
      os.system(f"flameprof {temp('net.prof')} > {temp('prof.svg')}")
      ps = stats.sort_stats(pstats.SortKey.TIME)
      ps.print_stats(30)

  def test_openpilot_model(self):
    dat = fetch(OPENPILOT_MODEL)
    onnx_model = onnx.load(io.BytesIO(dat))
    run_onnx = get_run_onnx(onnx_model)
    print("got run_onnx")
    inputs = {
      "input_imgs": np.random.randn(*(1, 12, 128, 256)),
      "big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
      "desire": np.zeros((1, 100, 8)),
      "traffic_convention": np.array([[1., 0.]]),
      "nav_features": np.zeros((1, 256)),
      "features_buffer": np.zeros((1, 99, 128)),
    }
    inputs = {k:v.astype(np.float32) for k,v in inputs.items()}

    st = time.monotonic()
    print("****** run onnx ******")
    tinygrad_out = run_onnx(inputs)['outputs']
    mt = time.monotonic()
    print("****** realize ******")
    tinygrad_out.realize()
    mt2 = time.monotonic()
    tinygrad_out = tinygrad_out.numpy()
    et = time.monotonic()
    print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")

    Tensor.no_grad = True
    torch_out = run_onnx_torch(onnx_model, inputs).numpy()
    Tensor.no_grad = False
    print(tinygrad_out, torch_out)
    np.testing.assert_allclose(torch_out, tinygrad_out, atol=1e-4, rtol=1e-2)

  def test_efficientnet(self):
    dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx")
    input_name, input_new = "images:0", True
    self._test_model(dat, input_name, input_new)

  def test_shufflenet(self):
    dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx")
    print(f"shufflenet downloaded : {len(dat)/1e6:.2f} MB")
    input_name, input_new = "gpu_0/data_0", False
    self._test_model(dat, input_name, input_new)

  @unittest.skip("test is very slow")
  def test_resnet(self):
    # NOTE: many onnx models can't be run right now due to max pool with strides != kernel_size
    dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx")
    print(f"resnet downloaded : {len(dat)/1e6:.2f} MB")
    input_name, input_new = "data", False
    self._test_model(dat, input_name, input_new)

  def _test_model(self, dat, input_name, input_new, debug=False):
    onnx_model = onnx.load(io.BytesIO(dat))
    print("onnx loaded")
    from test.models.test_efficientnet import chicken_img, car_img, preprocess, _LABELS
    run_onnx = get_run_onnx(onnx_model)

    def run(img):
      inputs = {input_name: preprocess(img, new=input_new)}
      tinygrad_out = list(run_onnx(inputs, debug=debug).values())[0].numpy()
      return tinygrad_out.argmax()

    cls = run(chicken_img)
    print(cls, _LABELS[cls])
    assert _LABELS[cls] == "hen" or _LABELS[cls] == "cock"
    cls = run(car_img)
    print(cls, _LABELS[cls])
    assert "car" in _LABELS[cls] or _LABELS[cls] == "convertible"

if __name__ == "__main__":
  unittest.main()


@@ -0,0 +1,100 @@
import unittest, time
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn import optim
from tinygrad.nn.state import get_parameters
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
from tinygrad.ops import Device, GlobalCounters
from tinygrad.helpers import CI, dtypes, getenv, prod
from test.helpers import derandomize_model

from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
from examples.hlb_cifar10 import SpeedyResNet
from examples.llama import Transformer as LLaMaTransformer, MODEL_PARAMS as LLAMA_MODEL_PARAMS
from examples.stable_diffusion import UNetModel

def helper_test(nm, gen, train, max_memory_allowed, max_kernels_allowed, all_jitted=False):
  tms = []
  for _ in range(4):
    GlobalCounters.reset()
    GlobalCounters.mem_used = 0
    Device[Device.DEFAULT].synchronize()
    st = time.perf_counter_ns()
    train(*gen())
    Device[Device.DEFAULT].synchronize()
    tms.append(time.perf_counter_ns() - st)

  kernels_used = len(train.jit_cache) if hasattr(train, "jit_cache") else None
  print(f"{nm}: used {GlobalCounters.mem_used/1e9:.2f} GB and {kernels_used} kernels in {min(tms)/1e6:.2f} ms")
  assert GlobalCounters.mem_used/1e9 < max_memory_allowed, f"{nm} used more than {max_memory_allowed:.2f} GB"
  assert not kernels_used or kernels_used <= max_kernels_allowed, f"{nm} used more than {max_kernels_allowed} kernels"
  if all_jitted:
    assert kernels_used > 0 and kernels_used == GlobalCounters.kernel_count, f"only {kernels_used} out of {GlobalCounters.kernel_count} were jitted"

class TestRealWorld(unittest.TestCase):
  def setUp(self):
    self.old_type = Tensor.default_type
    np.random.seed(2002)

  def tearDown(self):
    Tensor.default_type = self.old_type

  @unittest.skipUnless(not CI, "too big for CI")
  def test_stable_diffusion(self):
    model = UNetModel()
    derandomize_model(model)
    @TinyJit
    def test(t, t2): return model(t, 801, t2).realize()
    helper_test("test_sd", lambda: (Tensor.randn(1, 4, 64, 64), Tensor.randn(1, 77, 768)), test, 18.0, 967)

  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["LLVM"], "needs JIT, too long on CI LLVM")
  def test_llama(self):
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    model = LLaMaTransformer(**(args_tiny if CI else LLAMA_MODEL_PARAMS["1"]["7B"]["args"]))
    derandomize_model(model)
    @TinyJit
    def test(t): return model(t, 0).realize()
    # NOTE: only test one pass, not testing the dynamic shape autoregressive part
    helper_test("test_llama", lambda: (Tensor([[1,]]),), test, 0.22 if CI else 13.5, 126 if CI else 486, all_jitted=True)

  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM"] or not CI), "needs JIT, too long on CI LLVM")
  def test_gpt2(self):
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-5, "vocab_size": 1000}
    model = GPT2Transformer(**(args_tiny if CI else GPT2_MODEL_PARAMS["gpt2-medium"]))
    derandomize_model(model)
    @TinyJit
    def test(t): return model(t, 0).realize()
    helper_test("test_gpt2", lambda: (Tensor([[1,]]),), test, 0.21 if CI else 0.9, 129 if CI else 369, all_jitted=True)

  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM", "CLANG"] or not CI), "needs JIT, too long on CI LLVM and CLANG")
  def test_train_cifar(self):
    # TODO: with default device
    #old_default = Device.DEFAULT
    #Device.DEFAULT = "FAKE"
    #Device['fake'].codegen = Device[old_default].codegen

    with Tensor.train():
      model = SpeedyResNet(Tensor.ones((12,3,2,2)))
      optimizer = optim.SGD(get_parameters(model), lr=0.01, momentum=0.8, nesterov=True, weight_decay=0.15)

      BS = 32 if CI else 512

      @TinyJit
      def train(X):
        out = model(X)
        loss = out.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

      helper_test("train_cifar", lambda: (Tensor.randn(BS, 3, 32, 32),), train, (1.0/48)*BS, 154) # it's 154 on metal

    # reset device
    #Device.DEFAULT = old_default

if __name__ == '__main__':
  unittest.main()


@@ -0,0 +1,47 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from models.rnnt import LSTM
import torch

class TestRNNT(unittest.TestCase):
  def test_lstm(self):
    BS, SQ, IS, HS, L = 2, 20, 40, 128, 2

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.LSTM(IS, HS, L)

    # create in tinygrad
    layer = LSTM(IS, HS, L, 0.0)

    # copy weights
    with torch.no_grad():
      layer.cells[0].weights_ih.assign(Tensor(torch_layer.weight_ih_l0.numpy()))
      layer.cells[0].weights_hh.assign(Tensor(torch_layer.weight_hh_l0.numpy()))
      layer.cells[0].bias_ih.assign(Tensor(torch_layer.bias_ih_l0.numpy()))
      layer.cells[0].bias_hh.assign(Tensor(torch_layer.bias_hh_l0.numpy()))
      layer.cells[1].weights_ih.assign(Tensor(torch_layer.weight_ih_l1.numpy()))
      layer.cells[1].weights_hh.assign(Tensor(torch_layer.weight_hh_l1.numpy()))
      layer.cells[1].bias_ih.assign(Tensor(torch_layer.bias_ih_l1.numpy()))
      layer.cells[1].bias_hh.assign(Tensor(torch_layer.bias_hh_l1.numpy()))

    # test initial hidden
    for _ in range(3):
      x = Tensor.randn(SQ, BS, IS)
      z, hc = layer(x, None)
      torch_x = torch.tensor(x.numpy())
      torch_z, torch_hc = torch_layer(torch_x)
      np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)

    # test passing hidden
    for _ in range(3):
      x = Tensor.randn(SQ, BS, IS)
      z, hc = layer(x, hc)
      torch_x = torch.tensor(x.numpy())
      torch_z, torch_hc = torch_layer(torch_x, torch_hc)
      np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)

if __name__ == '__main__':
  unittest.main()


@@ -0,0 +1,83 @@
import unittest
import time
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.tensor import Device
from tinygrad.helpers import getenv
from extra.training import train
from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.transformer import Transformer
from models.vit import ViT
from models.resnet import ResNet18
import pytest

pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

BS = getenv("BS", 2)

def train_one_step(model,X,Y):
  params = get_parameters(model)
  pcount = 0
  for p in params:
    pcount += np.prod(p.shape)
  optimizer = optim.SGD(params, lr=0.001)
  print("stepping %r with %.1fM params bs %d" % (type(model), pcount/1e6, BS))
  st = time.time()
  train(model, X, Y, optimizer, steps=1, BS=BS)
  et = time.time()-st
  print("done in %.2f ms" % (et*1000.))

def check_gc():
  if Device.DEFAULT == "GPU":
    from extra.introspection import print_objects
    assert print_objects() == 0

class TestTrain(unittest.TestCase):
  def test_convnext(self):
    model = ConvNeXt(depths=[1], dims=[16])
    X = np.zeros((BS,3,224,224), dtype=np.float32)
    Y = np.zeros((BS), dtype=np.int32)
    train_one_step(model,X,Y)
    check_gc()

  def test_efficientnet(self):
    model = EfficientNet(0)
    X = np.zeros((BS,3,224,224), dtype=np.float32)
    Y = np.zeros((BS), dtype=np.int32)
    train_one_step(model,X,Y)
    check_gc()

  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "too many buffers for webgpu")
  def test_vit(self):
    model = ViT()
    X = np.zeros((BS,3,224,224), dtype=np.float32)
    Y = np.zeros((BS,), dtype=np.int32)
    train_one_step(model,X,Y)
    check_gc()

  def test_transformer(self):
    # this should be small GPT-2, but the param count is wrong
    # (real ff_dim is 768*4)
    model = Transformer(syms=10, maxlen=6, layers=12, embed_dim=768, num_heads=12, ff_dim=768//4)
    X = np.zeros((BS,6), dtype=np.float32)
    Y = np.zeros((BS,6), dtype=np.int32)
    train_one_step(model,X,Y)
    check_gc()

  def test_resnet(self):
    X = np.zeros((BS, 3, 224, 224), dtype=np.float32)
    Y = np.zeros((BS), dtype=np.int32)
    for resnet_v in [ResNet18]:
      model = resnet_v()
      model.load_from_pretrained()
      train_one_step(model, X, Y)
      check_gc()

  def test_bert(self):
    # TODO: write this
    pass

if __name__ == '__main__':
  unittest.main()


@@ -0,0 +1,25 @@
#!/usr/bin/env python
import pathlib
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device

class TestVGG7(unittest.TestCase):
  def test_vgg7(self):
    from examples.vgg7_helpers.waifu2x import Vgg7, image_load

    # Create in tinygrad
    Tensor.manual_seed(1337)
    mdl = Vgg7()
    mdl.load_from_pretrained()

    # Scale up an image
    test_x = image_load(pathlib.Path(__file__).parent / 'waifu2x/input.png')
    test_y = image_load(pathlib.Path(__file__).parent / 'waifu2x/output.png')
    scaled = mdl.forward_tiled(test_x, 156)
    scaled = np.fmax(0, np.fmin(1, scaled))
    np.testing.assert_allclose(scaled, test_y, atol=5e-3, rtol=5e-3)

if __name__ == '__main__':
  unittest.main()


@@ -0,0 +1,25 @@
import unittest
import pathlib
from tinygrad.ops import Device
from examples.whisper import init_whisper, transcribe_file

@unittest.skipUnless(Device.DEFAULT == "METAL", "Some non-metal backends spend too long trying to allocate a 20GB array")
class TestWhisper(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    model, enc = init_whisper("tiny.en")
    cls.model = model
    cls.enc = enc

  @classmethod
  def tearDownClass(cls):
    del cls.model
    del cls.enc

  def test_transcribe_file(self):
    # Audio generated with the command on macOS:
    # say "Could you please let me out of the box?" --file-format=WAVE --data-format=LEUI8@16000 -o test
    # We use the WAVE type because it's easier to decode in CI test environments
    filename = str(pathlib.Path(__file__).parent / "whisper/test.wav")
    transcription = transcribe_file(self.model, self.enc, filename)
    self.assertEqual("<|startoftranscript|><|notimestamps|> Could you please let me out of the box?<|endoftext|>", transcription)

Binary file added (not shown): 7.1 KiB

Binary file added (not shown): 15 KiB

Binary file added (not shown)