Add openpilot tests

FrogAi
2024-03-06 14:58:47 -07:00
parent 2901597132
commit b39097a12d
259 changed files with 31176 additions and 12 deletions

View File

@@ -0,0 +1,62 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()
from extra.dist import collectives
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def allreduce_jit(t:Tensor, cache_id=None) -> Tensor:
  return collectives.allreduce(t, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2
SIZE_2 = 255 if not CI else 3

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.zeros(SIZE, SIZE) if rank != 0 else Tensor.ones(SIZE, SIZE)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test")
    assert np.allclose(np.ones((SIZE, SIZE)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  # reset jit
  allreduce_jit.cnt = 0
  allreduce_jit.input_replace = {}

  # test uneven chunk sizes
  for _ in range(3):
    # create a tensor to send
    t = Tensor.ones(SIZE_2, SIZE_2, SIZE_2) if rank == 0 else Tensor.zeros(SIZE_2, SIZE_2, SIZE_2)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test2")
    assert np.allclose(np.ones((SIZE_2, SIZE_2, SIZE_2)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    from tinygrad.runtime.ops_hip import HIP
    devices = [f"hip:{i}" for i in range(HIP.device_count)]
  else:
    from tinygrad.runtime.ops_gpu import CL
    devices = [f"gpu:{i}" for i in range(len(CL.devices))] if not CI else ["gpu:0", "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
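
# Sketch of what this checks (illustrative, not part of the test): allreduce
# sums each rank's tensor and broadcasts the result to every rank, so with
# rank 0 holding ones and every other rank holding zeros the expected result
# on all ranks is simply:
#   np.ones((SIZE, SIZE)) + np.zeros((SIZE, SIZE)) == np.ones((SIZE, SIZE))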

View File

@@ -0,0 +1,68 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()
from extra.dist import world
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def send_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.send(t, target_rank, cache_id=cache_id).realize()

@TinyJit
def recv_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.recv(t, target_rank, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.randn(SIZE, SIZE)

    # send to rank 1
    if rank == 0:
      send_jit(t, 1, cache_id="test")
    elif rank == 1:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 0, cache_id="test")

    # recv from rank 1
    if rank == 0:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 1, cache_id="test2")
    elif rank == 1:
      send_jit(t2, 0, cache_id="test2")

    # check that the received tensor is the same as the sent tensor
    if rank == 0:
      assert np.allclose(t.numpy(), t2.numpy()), f"{t2.numpy()} wasn't equal to {t.numpy()}"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    devices = ["hip:0", "hip:1"]
  else:
    devices = ["gpu:0", "gpu:1" if not CI else "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
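
# Round-trip shape of the test above (illustrative): rank 0 sends t, rank 1
# echoes the received buffer straight back, and rank 0 compares the echo with
# the original, so only rank 0 ever needs to know the expected values:
#   rank 0: send(t, 1) ...... recv(t2, 1); assert t == t2
#   rank 1: recv(t2, 0) ..... send(t2, 0)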

View File

@@ -0,0 +1,27 @@
import unittest
from tinygrad.helpers import prod
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import GlobalCounters
from tinygrad.jit import CacheCollector
class TestCopy(unittest.TestCase):
  def test_add1(self):
    pts = []
    for i in range(16384, 16384*256, 16384):
      t = Tensor.randn(i).realize()
      CacheCollector.start()
      t.assign(t+1).realize()
      fxn, args, _ = CacheCollector.finish()[0]
      GlobalCounters.reset()
      def run(): return fxn(args, force_wait=True)
      ct = min([run() for _ in range(10)])
      mb = prod(t.shape)*t.dtype.itemsize*2*1e-6  # read + write traffic in MB
      print(f"{mb*1e3:.2f} kB, {ct*1e3:.2f} ms, {mb/ct:.2f} MB/s")
      pts.append((mb, mb/ct))
    from matplotlib import pyplot as plt
    plt.plot([x[0] for x in pts], [x[1] for x in pts])
    plt.show()

if __name__ == '__main__':
  unittest.main()
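
# Worked example of the bandwidth math above, assuming fxn(..., force_wait=True)
# returns elapsed seconds: assign(t+1) reads and writes every element once, so
# an N-element float32 tensor moves 2*4*N bytes. For the first size, N=16384,
# that is 2*4*16384*1e-6 = 0.13 MB, and MB/s is that divided by the runtime.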

View File

@@ -0,0 +1,102 @@
from lm_eval.base import BaseLM
from lm_eval import evaluator, tasks
import torch, json, argparse
from examples.llama import LLaMa
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
class LLaMaAdaptor(BaseLM):
  def __init__(
    self,
    model_size="7B",
    model_gen=1,
    device="",
    quantize=False,
    batch_size=1,
    max_batch_size=1,
    do_sample=False,
    temperature=1.0,
    checkpoint_path="",
    tokenizer_path="",
  ):
    super().__init__()

    if batch_size is None:
      batch_size = 1
    self.do_sample = do_sample
    self.temperature = temperature
    self._device = device

    assert isinstance(model_gen, int)
    assert isinstance(model_size, str)
    assert isinstance(batch_size, int)
    assert isinstance(checkpoint_path, str)
    assert isinstance(tokenizer_path, str)

    self.llama = LLaMa.build(checkpoint_path, tokenizer_path, model_gen, model_size, quantize)

  @classmethod
  def create_from_arg_string(cls, arg_string, additional_config=None):
    if additional_config is None: additional_config = {}
    kwargs = {el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",")}
    return cls(**kwargs, **additional_config)

  @property
  def eot_token_id(self):
    # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
    return self.llama.tokenizer.eos_id()

  @property
  def max_length(self):
    return 1024

  @property
  def max_gen_toks(self):
    return 256

  @property
  def batch_size(self):
    return 1

  @property
  def device(self):
    return self._device

  def tok_encode(self, string: str):
    return [self.llama.tokenizer.bos_id()] + self.llama.tokenizer.encode(string)

  def tok_decode(self, tokens):
    return self.llama.tokenizer.decode(tokens)

  def _model_call(self, inps):
    Tensor.no_grad = True
    return torch.Tensor(self.llama.model(Tensor(inps.numpy()), 0).numpy())

  def greedy_until(self, requests):
    continuations = []
    for request in requests:
      prompt, until = request[0], request[1]['until']
      output = self.llama.greedy_until(prompt, until, max_length=128, temperature=0.0)
      continuations.append(output[len(prompt):])
    return continuations

  def _model_generate(self, context, max_length, eos_token_id):
    raise NotImplementedError()

if __name__ == '__main__':
  print(f"using {Device.DEFAULT} backend")

  parser = argparse.ArgumentParser(description='Run LLaMA evals in tinygrad', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--size', type=str, default="7B", help="Size of model to use [7B, 13B, 30B, 65B] for Gen 1, [7B, 13B] for Gen 2")
  parser.add_argument('--gen', type=int, default=1, help="Generation of the model to use [1, 2]")
  parser.add_argument('--quantize', action='store_true', help="Quantize the weights to int8 in memory")
  parser.add_argument('--eval', type=str, default="arc_easy", help="Comma-separated list of eval tasks to run")
  parser.add_argument('--limit', type=int, default=None, help="Limit the number of examples per eval task")
  parser.add_argument('--weights', type=str, default="./weights/LLaMa/", help="Location of the weights")
  parser.add_argument('--tokenizer', type=str, default="./weights/LLaMa/tokenizer.model", help="Location of the tokenizer")
  args = parser.parse_args()

  # run eval and exit
  adaptor = LLaMaAdaptor(model_gen=args.gen, model_size=args.size, quantize=args.quantize, checkpoint_path=args.weights, tokenizer_path=args.tokenizer, device="cpu")
  results = evaluator.evaluate(adaptor, tasks.get_task_dict(args.eval.split(",")), False, 0, args.limit)
  print(json.dumps(results, indent=2))
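
# Hypothetical invocation (the script name and paths are illustrative; the
# flags match the parser above):
#   python3 llama_eval.py --gen 1 --size 7B --eval arc_easy --limit 100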

View File

@@ -0,0 +1,128 @@
import csv, pathlib, time, numpy as np
from os import getenv
import torch
torch.set_num_threads(1)
import onnx
from onnx.helper import tensor_dtype_to_np_dtype
import onnxruntime as ort
from onnx2torch import convert
from extra.utils import download_file
from extra.onnx import get_run_onnx
from tinygrad.helpers import OSX, DEBUG
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
MODELS = {
  "resnet50": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
  "openpilot": "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx",
  "efficientnet": "https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx",
  "shufflenet": "https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx",
  "commavq": "https://huggingface.co/commaai/commavq-gpt2m/resolve/main/gpt2m.onnx",

  # broken in torch MPS
  #"zfnet": "https://github.com/onnx/models/raw/main/vision/classification/zfnet-512/model/zfnet512-9.onnx",
  # TypeError: BatchNormalization() got an unexpected keyword argument 'is_test'
  #"densenet": "https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-3.onnx",
  # AssertionError: only onnx version >= 10 supported for slice
  #"bert": "https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-8.onnx",
  # really slow
  #"resnet18": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx",
}

CSV = {}
open_csv = None
torch.manual_seed(1)

def benchmark(mnm, nm, fxn):
  tms = []
  for _ in range(3):
    st = time.perf_counter_ns()
    ret = fxn()
    tms.append(time.perf_counter_ns() - st)
  print(f"{mnm:15s} {nm:25s} {min(tms)*1e-6:7.2f} ms")
  CSV[nm] = min(tms)*1e-6
  return min(tms), ret

#BASE = pathlib.Path(__file__).parents[2] / "weights" / "onnx"
BASE = pathlib.Path("/tmp/onnx")

def benchmark_model(m, validate_outs=False):
  global open_csv, CSV
  CSV = {"model": m}

  fn = BASE / MODELS[m].split("/")[-1]
  download_file(MODELS[m], fn)
  onnx_model = onnx.load(fn)
  output_names = [out.name for out in onnx_model.graph.output]
  excluded = {inp.name for inp in onnx_model.graph.initializer}
  input_shapes = {inp.name:tuple(x.dim_value if x.dim_value != 0 else 1 for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input if inp.name not in excluded}
  input_types = {inp.name: tensor_dtype_to_np_dtype(inp.type.tensor_type.elem_type) for inp in onnx_model.graph.input if inp.name not in excluded}
  #input_types = {k:v if v!=np.float16 else np.float32 for k,v in input_types.items()} # cast
  np_inputs = {k:torch.randn(shp).numpy().astype(input_types[k]) for k,shp in input_shapes.items()}
  assert len(input_shapes) < 30, f"too many input shapes {len(input_shapes)}"

  # print input names
  if DEBUG >= 2: print([inp.name for inp in onnx_model.graph.input if inp.name not in excluded])

  # bench tinygrad, with and without the jit
  for device in ["METAL" if OSX else "GPU", "CLANG"]: # + (["CUDA"] if torch.cuda.is_available() else []):
    Device.DEFAULT = device
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})

    from tinygrad.jit import TinyJit
    tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
    for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}  # warm up the jit
    benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()})
    del inputs, tinygrad_model, tinygrad_jitted_model

  # bench torch via onnx2torch
  try:
    torch_model = convert(onnx_model)
    torch_inputs = [torch.tensor(x) for x in np_inputs.values()]
    benchmark(m, "torch_cpu", lambda: torch_model(*torch_inputs))

    torch_device = "mps" if OSX else "cuda"
    torch_mps_model = torch_model.to(torch_device)
    torch_mps_inputs = [x.to(torch_device) for x in torch_inputs]
    benchmark(m, f"torch_{torch_device}", lambda: torch_mps_model(*torch_mps_inputs))
  except Exception as e: print(f"{m:16s}onnx2torch {type(e).__name__:>25}")

  # bench onnxruntime
  ort_options = ort.SessionOptions()
  ort_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
  ort_options.log_severity_level = 3 # no warnings
  for backend in ["CPU", "CUDA" if not OSX else "CoreML"]: # https://onnxruntime.ai/docs/execution-providers/
    provider = backend+"ExecutionProvider"
    if provider not in ort.get_available_providers(): continue
    ort_sess = ort.InferenceSession(str(fn), ort_options, [provider])
    benchmark(m, f"onnxruntime_{backend.lower()}", lambda: ort_sess.run(output_names, np_inputs))
    del ort_sess

  if validate_outs:
    rtol, atol = 2e-3, 2e-3 # tolerance for fp16 models
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    tinygrad_out = tinygrad_model(inputs)

    ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
    onnx_out = ort_sess.run(output_names, np_inputs)
    onnx_out = dict(zip(output_names, onnx_out))

    assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
    print(f"{m:16s}outputs validated with rtol={rtol:.1e}, atol={atol:.1e}")

  if open_csv is None:
    open_csv = csv.DictWriter(open('onnx_inference_speed.csv', 'w', newline=''), fieldnames=list(CSV.keys()))
    open_csv.writeheader()
  open_csv.writerow(CSV)

def assert_allclose(tiny_out:dict, onnx_out:dict, rtol=1e-5, atol=1e-5):
  assert len(tiny_out) == len(onnx_out) and tiny_out.keys() == onnx_out.keys()
  for k in tiny_out.keys():
    tiny_v, onnx_v = tiny_out[k], onnx_out[k]
    if tiny_v is None: assert tiny_v == onnx_v
    else: np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}")

if __name__ == "__main__":
  if getenv("MODEL", "") != "": benchmark_model(getenv("MODEL", ""), True)
  else:
    for m in MODELS: benchmark_model(m, True)
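
# Example usage (grounded in the getenv("MODEL") check above; the file name is
# illustrative): benchmark a single model with
#   MODEL=openpilot python3 benchmark_onnx.py
# or run with no MODEL set to benchmark every entry in MODELS and write
# onnx_inference_speed.csv.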

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# cd disassemblers/ && git clone --recursive git@github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored, Timing
from tinygrad.runtime.ops_gpu import CL

# TODO: support multidevice in cuda
device = 'gpu'

if __name__ == "__main__":
  sz = 1024*1024*256  # 256M float32s = 1 GB
  #sz = 1024*64

  with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
    c0 = Tensor.ones(sz, device="cpu").realize()
    c1 = (Tensor.ones(sz, device="cpu")/2).realize()

  with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a0 = c0.to(f'{device}:0').realize()
    CL.synchronize()
  with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b1 = c1.to(f'{device}:1').realize()
    CL.synchronize()

  # cross copy. this is going through the CPU
  with Timing("0 -> CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a1 = a0.to(f'{device}:1').realize()
    CL.synchronize()
  with Timing("1 -> CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b0 = b1.to(f'{device}:0').realize()
    CL.synchronize()

  # sum
  with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab0 = (a0 + b0).realize()
    CL.synchronize()
  with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab1 = (a1 + b1).realize()
    CL.synchronize()

  # cross device sum (does this work?)
  # is this making a copy first? is that copy through the CPU?
  # the slowness comes from the *blocking* clprg call, is this pyopencl?
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx0 = (a0 + b1).realize()
    CL.synchronize()
  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx1 = (b1 + a0).realize()
    CL.synchronize()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc1 = ab1.numpy()

  # same
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # devices
  print(ab0)
  print(ab1)
  print(abx0)
  print(abx1)
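
# Units note (assuming Timing hands on_exit the elapsed nanoseconds, as
# tinygrad.helpers.Timing does): sz*4 is the buffer size in bytes, and
# bytes/ns == GB/s, so for sz = 1024*1024*256 floats each copy moves 1 GiB.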

View File

@@ -0,0 +1,41 @@
from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
from tinygrad.helpers import dtypes
import time

N = 1000000
a = CLBuffer(N, dtypes.float32)
b = CLBuffer(N, dtypes.float32)
c = CLBuffer(N, dtypes.float32)

prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
  int idx = get_global_id(0);
  a[idx] = b[idx] + c[idx];
}""")

# warmup run so the kernel is compiled and resident
prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)

# one timed run, a 3 second sleep, then a second timed run
t1 = time.monotonic_ns()
e1 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t2 = time.monotonic_ns()
time.sleep(3)
t3 = time.monotonic_ns()
e2 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t4 = time.monotonic_ns()

# host wall time vs the device-side event profile for each run
print(e1.profile.queued)
print(e1.profile.submit)
print(e1.profile.start)
print(e1.profile.end)
print(e1, e2)
print(t2-t1, e1.profile.end - e1.profile.start)
print(t4-t3, e2.profile.end - e2.profile.start)
print(t3-t2, e2.profile.queued-e1.profile.end)
print((t3-t2) / (e2.profile.start-e1.profile.end), "ratio")
print("ratio since boot", t1/e1.profile.start)
print(e1.profile.start)
print(e1.profile.end)
print(e2.profile.start)
print(e2.profile.end)
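
# Interpretation note (OpenCL semantics, not specific to this file): the event
# .profile values are device-clock timestamps in nanoseconds, while
# time.monotonic_ns() is the host clock, so the host/device deltas and the
# "ratio since boot" line are only meaningful to the extent that the two
# clocks share an epoch and rate.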

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env python
import unittest, gc
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_state_dict
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad.ops import Device
from test.helpers import derandomize_model
from examples.llama import Transformer
ALLOCATED_DEV_BUFS = 0
class FakeDeviceBuffer:
  def __init__(self, sz, dt, device):
    self.id = 1
    self.size = sz
    self.dtype = dt
    self.device = device

    global ALLOCATED_DEV_BUFS
    ALLOCATED_DEV_BUFS += 1

class FakeAllocator(LRUAllocator):
  def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
  def _do_free(self, buf):
    buf.id -= 1
    assert buf.id == 0, f"Free should be called once, but {buf.id}"
  def __del__(self): # Fake allocator should clear all buffers after each test.
    for v in self.cached_buffers.values():
      for buf, _ in v: self._free_buffer(buf)

FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
  def __init__(self, size, dtype, device='0'):
    global FAKE_GLOBAL_ALLOCATOR
    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
    assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)

class FakeProgram:
  def __init__(self, name:str, prg:str): pass
  def __call__(self, *bufs, global_size, local_size, wait=False): pass

def helper_test_correctness(gen, train):
  from tinygrad.runtime.ops_gpu import CL, CLAllocator
  old_alloc = CL.cl_allocator
  CL.cl_allocator = CLAllocator(0)
  no_alloc_result = train(*gen()).numpy()
  Device[Device.DEFAULT].synchronize()
  CL.cl_allocator = CLAllocator(512<<30) # Test cache correctness, so cache as much as possible, 512gb
  for _ in range(4):
    GlobalCounters.reset()
    np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
    Device[Device.DEFAULT].synchronize()
  assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
  CL.cl_allocator = old_alloc

def __helper_test_alloc_count(gen, train):
  was_alloc = ALLOCATED_DEV_BUFS
  for _ in range(2):
    train(*gen())
  return ALLOCATED_DEV_BUFS - was_alloc

def helper_test_alloc_count(mm, gen, train):
  global FAKE_GLOBAL_ALLOCATOR
  backup_program = Device[Device.DEFAULT].runtime
  backup_buffer = Device[Device.DEFAULT].buffer
  Device[Device.DEFAULT].runtime = FakeProgram
  Device[Device.DEFAULT].buffer = FakeBuffer
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
  new_allocs = __helper_test_alloc_count(gen, train)
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
  old_allocs = __helper_test_alloc_count(gen, train)
  print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
  assert new_allocs < old_allocs, "Hmm, doesn't cache work any more?"
  Device[Device.DEFAULT].runtime = backup_program
  Device[Device.DEFAULT].buffer = backup_buffer
  FAKE_GLOBAL_ALLOCATOR = None

def check_gc():
  if Device.DEFAULT == "GPU":
    gc.collect() # Need to collect Tensors.
    from extra.introspection import print_objects
    assert print_objects() == 0

class TestAllocators(unittest.TestCase):
  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def __test():
      model = Transformer(**args_tiny)
      derandomize_model(model)
      def test(t): return model(t, 0).realize()
      helper_test_correctness(lambda: (Tensor([[1,]]),), test)
    __test()

    Tensor.default_type = old_type
    check_gc()

  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama_alloc_counts(self):
    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def test_alloc_count(t):
      model = Transformer(**args_tiny)
      for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
      return model(t, 0).realize()
    helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
    check_gc()

  @unittest.skip("huge for CI")
  def test_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()
    helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test)

if __name__ == "__main__":
  unittest.main()

View File

@@ -0,0 +1,8 @@
from tinygrad.tensor import Tensor
from tinygrad.nn import Embedding

if __name__ == "__main__":
  vocab_size = 50257
  dim = 128
  test = Embedding(vocab_size, dim)
  ret = test(Tensor([[1,2,3]])).numpy()
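  # hedged sanity check (assumes Embedding maps (batch, seq) -> (batch, seq, dim))
  assert ret.shape == (1, 3, dim), f"unexpected output shape {ret.shape}"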

View File

@@ -0,0 +1,208 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.ops import LazyOp, ReduceOps, BinaryOps, UnaryOps, MovementOps
from tinygrad.shape.shapetracker import ShapeTracker, View, ZeroView
from tinygrad.runtime.ops_gpu import GPUBuffer, CLProgram, CLCodegen
#from tinygrad.runtime.ops_metal import MetalBuffer as GPUBuffer, MetalProgram as CLProgram, MetalCodegen as CLCodegen
from tinygrad.helpers import getenv
from extra.lib_test_ast import test_ast
import platform
OSX = platform.system() == "Darwin"
def compile_and_test_ast(ast, local_size=None):
  k = CLCodegen(ast)
  prg = k.codegen().build(CLProgram)
  if local_size is not None: prg.local_size = local_size
  for i in range(5): prg(prg.lower(k.bufs))
  if getenv("TEST", 0): test_ast(k)

class TestAST(unittest.TestCase):
  def test_conv_zeroview_ast(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (2, 2, 2, 1), -3), ZeroView((1, 1, 1, 2), ((0, 1), (0, 1), (-1, 2), (-1, 3))), View((1, 1, 3, 4), (0, 0, 4, 1), 0)]))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (0, 0, 0, 0), 0)]))
    op1 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(UnaryOps.RELU, (op1,), None)
    compile_and_test_ast(ast)

  def test_cifar_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 64, 34, 34), (65536, 1024, 32, 1), -33), ZeroView((512, 64, 32, 32), ((0, 512), (0, 64), (-1, 33), (-1, 33))), View((512, 1, 128, 32, 32, 64, 3, 3), (73984, 73984, 0, 34, 1, 1156, 34, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 64, 32, 32), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 1, 128, 32, 32, 64, 3, 3), (0, 0, 576, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 64, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(ReduceOps.SUM, (op0,), (512, 1, 128, 32, 32, 1, 1, 1))
    compile_and_test_ast(ast)

  def test_cifar_conv_backward(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 512, 10, 10), (64, 16384, 8, 1), -9), ZeroView((256, 512, 8, 8), ((0, 256), (0, 512), (-1, 9), (-1, 9))), View((256, 1, 512, 3, 3, 512, 8, 8), (51200, 51200, 0, 10, 1, 100, 10, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 256, 8, 8), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 1, 512, 3, 3, 512, 8, 8), (0, 0, 64, 0, 0, 32768, 8, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 512, 8, 8), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (256, 1, 512, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (256, 512, 3, 3))
    compile_and_test_ast(ast)

  def test_first_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 130, 258, 1, 12), (393216, 3072, 12, 12, 1), -3084), ZeroView((1, 128, 256, 1, 12), ((0, 1), (-1, 129), (-1, 257), (0, 1), (0, 12))), View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 6192, 24, 0, 0, 3096, 12, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 768, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 0, 0, 432, 4, 144, 16, 48, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 108, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_second_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 66, 130, 32, 1), (262144, 4096, 32, 1, 1), -4128), ZeroView((1, 64, 128, 32, 1), ((0, 1), (-1, 65), (-1, 129), (0, 32), (0, 1))), View((1, 64, 128, 8, 4, 1, 1, 3, 3), (266240, 4160, 32, 4, 1, 12480, 12480, 4160, 32), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 64, 128, 8, 4, 1, 1, 3, 3), (0, 0, 0, 36, 1, 0, 0, 12, 4), 0)]), hostbuf=GPUBuffer(shape=(8, 9, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_third_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 4096, 32, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 0, 0, 128, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(4, 32, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 4, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 4, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(16,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op2,), (64, 512, 4))
    compile_and_test_ast(ast)

  # VALIDHACKS=1 IMAGE=2 DEBUG=4 PYTHONPATH="." GPU=1 OPT=2 python3 test/external_test_gpu_ast.py TestAST.test_reduce_op
  # 164 time 27.75 ms running re_S128_4 with [128] None count 4 runtime 1016.06 us 2.07 GFLOPS () -> (128, 1)
  # 169 time 22.51 ms running matmul with [4, 16, 128] [4, 16, 16] count 5 runtime 110.08 us 19.06 GFLOPS ('-DMATMUL',) -> (128, 1)
  def test_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 128, 4, 1, 1, 1, 1), (512, 512, 512, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  def test_alt_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  # re_S32_16_36_6 is fast
  def test_1x1_36_6(self): # 36 <- 6
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 1536, 24, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 0, 0, 0, 0, 96, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(36, 24, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(144,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op6 = LazyOp(BinaryOps.MAX, (op5,buf5,), None)
    op7 = LazyOp(BinaryOps.SUB, (op3,op6,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op7,), (32, 2304, 4))
    compile_and_test_ast(ast, None if OSX else (16, 16, 4))

  # re_S32_16_6_36 is slow
  def test_1x1_6_36(self): # 6 <- 36
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 9216, 144, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 2304, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 0, 0, 0, 0, 576, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 144, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 1536, 24, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    op3 = LazyOp(BinaryOps.ADD, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 16, 4))

  # re_S32_16_6_24
  def test_1x1_6_24(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 6144, 96, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 1536, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 0, 0, 0, 0, 384, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 96, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    #buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    #op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 4, 8))

  def test_full_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (512, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op0 = LazyOp(BinaryOps.ADD, (buf0,buf1,), None)
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([2.], dtype=np.float32)))
    op1 = LazyOp(BinaryOps.POW, (op0,buf2,), None)
    op2 = LazyOp(ReduceOps.SUM, (op1,), (1, 1))
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([0.5], dtype=np.float32)))
    op3 = LazyOp(BinaryOps.POW, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op4 = LazyOp(BinaryOps.SUB, (op3,buf4,), None)
    op5 = LazyOp(UnaryOps.RELU, (op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op6 = LazyOp(BinaryOps.ADD, (op5,buf5,), None)
    buf6 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([3.4e+38], dtype=np.float32)))
    op7 = LazyOp(BinaryOps.SUB, (op3,buf6,), None)
    op8 = LazyOp(UnaryOps.RELU, (op7,), None)
    op9 = LazyOp(BinaryOps.SUB, (op6,op8,), None)
    op10 = LazyOp(UnaryOps.RECIPROCAL, (op9,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op10,), (1, 1))
    compile_and_test_ast(ast)

  def test_1239_reduce(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 64, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 1024, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(1239, 256, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1239, 4, 1, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (1, 1, 1, 1, 4956))
    compile_and_test_ast(ast)

  def test_enet_first_conv_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 3, 225, 225), (150528, 50176, 224, 1), 0), ZeroView((8, 3, 224, 224), ((0, 8), (0, 3), (0, 225), (0, 225))), View((8, 1, 32, 112, 112, 3, 3, 3), (151875, 151875, 0, 450, 2, 50625, 225, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 1, 32, 112, 112, 3, 3, 3), (0, 0, 27, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (8, 1, 32, 112, 112, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (8, 32, 112, 112))
    compile_and_test_ast(ast)

  def test_enet_reduce_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 32, 225, 225), (50176, 150528, 224, 1), 0), ZeroView((3, 32, 224, 224), ((0, 3), (0, 32), (0, 225), (0, 225))), View((3, 1, 32, 3, 3, 32, 112, 112), (1620000, 1620000, 0, 225, 1, 50625, 450, 2), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 1, 32, 3, 3, 32, 112, 112), (0, 12845056, 401408, 0, 0, 12544, 112, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 32, 1, 1, 32, 112, 112), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (3, 1, 32, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (3, 32, 3, 3))
    compile_and_test_ast(ast)

if __name__ == '__main__':
  unittest.main()

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python
import os
import unittest
import numpy as np
if 'IMAGE' not in os.environ:
  os.environ['IMAGE'] = '2'
  os.environ['GPU'] = '1'
  os.environ['OPT'] = '2'
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
Tensor.no_grad = True

class TestImage(unittest.TestCase):
  def test_create_image(self):
    t = Tensor.ones(128, 128, 1)
    t = t.reshape(128, 32, 4) + 3
    t.realize()
    np.testing.assert_array_equal(t.numpy(), np.ones((128,32,4))*4)

  def test_sum_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t1.realize()
    t1 = t1.sum()
    t1.realize()
    assert t1.numpy() == 16*4*4*4, f"got {t1.numpy()}"

  def test_add_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t2 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 4
    t1.realize()
    t2.realize()
    t3 = t1 + t2
    t3.realize()
    np.testing.assert_array_equal(t3.numpy(), np.ones((16,4,4))*9)

  def test_padded_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    tiny_conv(tiny_dat).realize()

  def test_op_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dconv = Conv2d(out_chans, out_chans, 1, bias=None, padding=0)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    p2 = tiny_conv(tiny_dat).relu()
    p2 = tiny_dconv(p2)
    p2.realize()

if __name__ == '__main__':
  unittest.main()
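
# Background (my reading of the IMAGE=2 backend, not stated in this file):
# image mode backs tensors with OpenCL image2d_t objects holding float4
# texels, which is why every tensor here reshapes to a trailing dimension of 4.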

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
from tinygrad.helpers import dtypes, CI
from tinygrad.ops import Device
from test.helpers import derandomize_model
from examples.llama import Transformer
def helper_test_jitted_correctness(gen, train, train_jit):
  nojit = train(*gen()).numpy()
  for _ in range(5): jit = train_jit(*gen()).numpy()
  np.testing.assert_allclose(nojit, jit, rtol=1e-3, atol=1e-5)

@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE, "needs JIT")
class TestJittedModels(unittest.TestCase):
  def test_jitted_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    derandomize_model(model)
    def test(t): return model(t, 0).realize()

    @TinyJit
    def test_jit(t): return model(t, 0).realize()

    helper_test_jitted_correctness(lambda: (Tensor([[1,]]),), test, test_jit)
    Tensor.default_type = old_type

  @unittest.skipUnless(not CI, "huge for CI")
  def test_jitted_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()

    @TinyJit
    def test_jit(t, t2): return model(t, 801, t2).realize()

    helper_test_jitted_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test, test_jit)

if __name__ == "__main__":
  unittest.main()

View File

@@ -0,0 +1,208 @@
import unittest
from onnx.backend.base import Backend, BackendRep
import onnx.backend.test
import numpy as np
from tinygrad.tensor import Tensor
from typing import Any, Tuple
from tinygrad.helpers import getenv, CI
# pip3 install tabulate
pytest_plugins = 'onnx.backend.test.report',
from extra.onnx import get_run_onnx
class TinygradModel(BackendRep):
  def __init__(self, run_onnx, input_names):
    super().__init__()
    self.fxn = run_onnx
    self.input_names = input_names

  def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
    real_inputs = {k:v for k,v in zip(self.input_names, inputs)}
    ret = self.fxn(real_inputs, debug=True)
    return tuple(x.numpy() if isinstance(x, Tensor) else [i.numpy() for i in x] if isinstance(x, list) else np.array(x) for x in ret.values())

class TinygradBackend(Backend):
  @classmethod
  def prepare(cls, model, device):
    input_all = [x.name for x in model.graph.input]
    input_initializer = [x.name for x in model.graph.initializer]
    net_feed_input = [x for x in input_all if x not in input_initializer]
    print("prepare", cls, device, net_feed_input)
    run_onnx = get_run_onnx(model)
    return TinygradModel(run_onnx, net_feed_input)

  @classmethod
  def supports_device(cls, device: str) -> bool:
    return device == "CPU"
backend_test = onnx.backend.test.BackendTest(TinygradBackend, __name__)
# no support for reduce with multiply (needs llop)
backend_test.exclude('test_reduce_prod_*')
# TODO figure out why it's returning wrong values, geohotstan's uneducated guess is it's due to imprecision from float64 (double) -> float32
# see Type Constraints: https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Adam.html#type-constraints
backend_test.exclude('test_adam_multiple_cpu')
backend_test.exclude('test_nesterov_momentum_cpu')
# we only support float32
backend_test.exclude('uint8')
backend_test.exclude('uint16')
backend_test.exclude('uint32')
backend_test.exclude('uint64')
backend_test.exclude('int8')
backend_test.exclude('int16')
backend_test.exclude('float64')
backend_test.exclude('string')
backend_test.exclude('test_pow_types_int*')
backend_test.exclude('test_cast_*')
backend_test.exclude('test_castlike_*')
backend_test.exclude('test_convinteger_*')
backend_test.exclude('test_matmulinteger_*')
backend_test.exclude('test_reduce_log_sum_exp*') # dependent on actual float64 implementation for backends
backend_test.exclude('test_operator_add*') # dependent on float64 math. Without it values default to 0 or inf
# we don't support indexes
# backend_test.exclude('test_argmax_*') # Needs more work: select_last_index
# backend_test.exclude('test_argmin_*') # Needs more work: select_last_index
backend_test.exclude('test_nonzero_*')
# no support for mod
backend_test.exclude('test_mod_*')
# no boolean ops (2d, 3d, 4d)
backend_test.exclude('test_bitshift_*')
# no scatternd gathernd
backend_test.exclude('test_gathernd_*')
backend_test.exclude('test_scatternd_*')
# no quantize
backend_test.exclude('test_dynamicquantizelinear_*')
backend_test.exclude('test_qlinearmatmul_*')
backend_test.exclude('test_qlinearconv_*')
backend_test.exclude('test_quantizelinear_*')
# no rnn
backend_test.exclude('test_gru_*')
backend_test.exclude('test_rnn_*')
backend_test.exclude('test_lstm_*')
backend_test.exclude('test_simple_rnn_*')
# no control flow
backend_test.exclude('test_if_*')
backend_test.exclude('test_loop*')
backend_test.exclude('test_range_float_type_positive_delta_expanded_cpu') # requires loop
# unsupported (strange) ops
backend_test.exclude('test_bitwise_*')
backend_test.exclude('test_blackmanwindow_*')
backend_test.exclude('test_bernoulli_*')
backend_test.exclude('test_cumsum_*')
backend_test.exclude('test_det_*')
backend_test.exclude('test_tril_zero_cpu') # TODO: zero array support
backend_test.exclude('test_triu_zero_cpu') # TODO: zero array support
backend_test.exclude('test_col2im_*')
backend_test.exclude('test_hammingwindow_*')
backend_test.exclude('test_hannwindow_*')
backend_test.exclude('test_hardmax_*')
backend_test.exclude('test_gridsample_*')
backend_test.exclude('test_dft_*')
backend_test.exclude('test_einsum_*')
backend_test.exclude('test_strnorm_*')
backend_test.exclude('test_unique_*')
backend_test.exclude('test_sequence_*')
backend_test.exclude('test_nonmaxsuppression_*')
backend_test.exclude('test_reversesequence_*')
backend_test.exclude('test_roialign_*')
backend_test.exclude('test_top_k_*')
backend_test.exclude('test_tfidfvectorizer_*')
backend_test.exclude('test_stft_*')
backend_test.exclude('test_melweightmatrix_*')
# more strange ops
backend_test.exclude('test_basic_deform_conv_*')
backend_test.exclude('test_deform_conv_*')
backend_test.exclude('test_lppool_*')
backend_test.exclude('test_depthtospace_*')
backend_test.exclude('test_spacetodepth_*')
backend_test.exclude('test_scan*')
backend_test.exclude('test_split_to_sequence_*')
backend_test.exclude('test_resize_downsample_scales_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_downsample_sizes_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_scales_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_sizes_cubic_*') # unsure how to implement cubic
# rest of the failing tests
backend_test.exclude('test_averagepool_2d_dilations_cpu') # dilations != 1 not supported for avgpool
backend_test.exclude('test_convtranspose_autopad_same_cpu') # TODO geohotstan has no idea how this is done, autopad requires output_shape but output_shape requires pads from autopad
backend_test.exclude('test_optional_has_element_empty_optional_input_cpu') # Attempts to create Tensor from None
backend_test.exclude('test_range_int32_type_negative_delta_expanded_cpu') # AttributeProto.GRAPH not implemented
backend_test.exclude('test_reshape_allowzero_reordered_cpu') # reshaping to 0 shape
backend_test.exclude('test_resize_downsample_scales_linear_antialias_cpu') # antialias not implemented
backend_test.exclude('test_resize_downsample_sizes_linear_antialias_cpu') # antialias not implemented
backend_test.exclude('test_resize_tf_crop_and_resize_cpu') # unsure about fill value after clip
backend_test.exclude('test_operator_addconstant_cpu') # bad data type
# issue 1556 https://github.com/tinygrad/tinygrad/issues/1556
backend_test.exclude('test_isinf_cpu')
backend_test.exclude('test_isinf_negative_cpu')
backend_test.exclude('test_isinf_positive_cpu')
backend_test.exclude('test_isnan_cpu')
# issue 1791 fast math messes with these https://github.com/tinygrad/tinygrad/issues/1791
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_2_3_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_3_2_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_cpu')
# issue 2067 potentially also a fastmath issue https://github.com/tinygrad/tinygrad/issues/2067
if getenv('METAL'):
  backend_test.exclude('test_maxpool_2d_pads_cpu')
  backend_test.exclude('test_maxpool_2d_same_lower_cpu')

# Don't know how to treat special TensorProto like TensorProto.FLOAT8E4M3FN
if getenv("CPU") or getenv("TORCH"):
  backend_test.exclude('test_dequantizelinear_axis_cpu')
  backend_test.exclude('test_dequantizelinear_cpu')

# compiled backends cannot reshape to and from 0
if getenv('LLVM') or getenv('GPU') or getenv('CLANG') or getenv('METAL') or getenv('CUDA'):
  backend_test.exclude('test_slice_start_out_of_bounds_cpu')
  backend_test.exclude('test_constantofshape_int_shape_zero_cpu')

if getenv('GPU') or getenv('METAL'):
  backend_test.exclude('test_mish_cpu') # weird inaccuracy
  backend_test.exclude('test_mish_expanded_cpu') # weird inaccuracy
  backend_test.exclude('test_eyelike_with_dtype_cpu') # backend does not support dtype: Double

# Segfaults in CI
if (getenv('LLVM') or getenv('CUDA')) and CI:
  backend_test.exclude('test_max_float16_cpu')
  backend_test.exclude('test_min_float16_cpu')

# disable model tests for now since they are slow
if not getenv("MODELTESTS"):
  for x in backend_test.test_suite:
    if 'OnnxBackendRealModelTest' in str(type(x)):
      backend_test.exclude(str(x).split(" ")[0])
else:
  # model tests all pass!
  backend_test.include('test_resnet50')
  backend_test.include('test_inception_v1')
  backend_test.include('test_inception_v2')
  backend_test.include('test_densenet121')
  backend_test.include('test_shufflenet')
  backend_test.include('test_squeezenet')
  backend_test.include('test_bvlc_alexnet')
  backend_test.include('test_zfnet512')
  backend_test.include('test_vgg19')
globals().update(backend_test.enable_report().test_cases)
if __name__ == '__main__':
unittest.main()
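
# To run a subset of the generated cases, pytest's -k filter works on the test
# names enabled above, e.g. (illustrative): python3 -m pytest <this file> -k relu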

View File

@@ -0,0 +1,392 @@
#!/usr/bin/env python
import os
import torch
if "OPT" not in os.environ:
os.environ["OPT"] = "2"
import gc
import numpy as np
import unittest
from tinygrad.tensor import Tensor, Device
from tinygrad import nn
from tinygrad.helpers import getenv
from tinygrad.nn import optim
from tinygrad.helpers import GlobalCounters
from tinygrad.lazy import PUSH_PERMUTES
from tinygrad.jit import CacheCollector
class CLCache:
def __init__(self, allowed=None, strict=False, preclear=True): self.allowed, self.strict, self.preclear = allowed, strict, preclear
def __enter__(self):
if self.preclear:
gc.collect()
for x in [x for x in gc.get_objects() if isinstance(x, Tensor)]:
x.realize()
GlobalCounters.reset()
CacheCollector.start()
print("cache: entering")
def __exit__(self, type, value, traceback):
cache = CacheCollector.finish()
print(f"cache: exiting with size {len(cache)}", f"allowed {self.allowed}" if self.allowed is not None else "")
if self.allowed is not None:
assert len(cache) <= self.allowed and (not self.strict or len(cache) == self.allowed), f"used too many kernels! {len(cache)} > {self.allowed}"
from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.resnet import ResNet18
from models.vit import ViT
from tinygrad.nn.state import get_parameters
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestInferenceMinKernels(unittest.TestCase):
def setUp(self):
Tensor.training = False
@unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
def test_convnext(self):
model = ConvNeXt()
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
with CLCache(129):
model(img).realize()
def test_enet(self):
model = EfficientNet(getenv("ENET_NUM", 0), has_se=False)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
with CLCache(51):
model.forward(img).realize()
def test_enet_se(self):
model = EfficientNet(getenv("ENET_NUM", 0), has_se=True)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
# TODO: this seems very high
with CLCache(115):
model.forward(img).realize()
def test_resnet(self):
model = ResNet18()
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
with CLCache(26):
model.forward(img).realize()
def test_vit(self):
model = ViT(embed_dim=192, num_heads=3)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
with CLCache(222): # NOTE: this is way too high
out = model.forward(img)
assert len(CacheCollector.cache) == 0, "ViT prerealized?"
out.realize()
def test_llama(self):
from examples.llama import Transformer
args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
model = Transformer(**args_tiny)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
with CLCache(85):
model(Tensor([[1,2,3,4]]), 0).realize()
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptBinOp(unittest.TestCase):
def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
a = Tensor.randn(16, 16)
b = Tensor.randn(16, 16)
with CLCache():
c = f1(a, b)
if f2 is not None: d = f2(a, b)
c.realize()
if f2 is not None: d.realize()
assert len(CacheCollector.cache) == allowed, "binop was rerun!"
if f2 is not None: np.testing.assert_allclose(c.numpy().ravel(), d.numpy().ravel(), rtol=1e-3, atol=1e-5)
def test_no_binop_rerun(self): return self._test_no_binop_rerun(lambda a,b: a*b, lambda a,b: (a*b).reshape(16, 16, 1))
def test_no_binop_rerun_alt(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(16, 16, 1), lambda a,b: a*b)
def test_no_binop_rerun_reduce_broadcast(self): return self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=2)
@unittest.skip("this test started failing with the new change, based movementop issue")
def test_no_binop_rerun_transposed(self): return self._test_no_binop_rerun(lambda a,b: (a.T*b.T).T, lambda a,b: a*b)
def test_no_binop_rerun_mid_reshape(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(256)+a.reshape(256))
# currently non working tests
#def test_no_binop_rerun_preshape(self): return self._test_no_binop_rerun(lambda a,b: a.reshape(16, 16, 1)*b.reshape(16, 16, 1), lambda a,b: a*b)
#def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
#def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptReduceLoop(unittest.TestCase):
@unittest.skip("this is broken")
def test_loop_left(self):
    a = Tensor.randn(16, 16)
with CLCache():
t = a.sum(0)
b = t.reshape(16,1).expand(16,16).sum(0)
c = (t+b)
c.realize()
assert len(CacheCollector.cache) == 2, "loop left fusion broken"
def test_loop_right(self):
    a = Tensor.randn(16, 16)
with CLCache():
t = a.sum(0)
b = t.reshape(16,1).expand(16,16).sum(0)
c = (b+t)
c.realize()
assert len(CacheCollector.cache) == 2, "loop right fusion broken"
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptWChild(unittest.TestCase):
def test_unrealized_child(self):
a = Tensor.randn(16, 16)
b = Tensor.randn(16, 16)
with CLCache():
c = (a*b).sum()
d = c+1
e = c+2
d.realize()
assert len(CacheCollector.cache) == 2, "don't fuse if you have children"
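      # why 2 kernels: c has a second unrealized child (e), so the scheduler can't
      # fuse c's reduce into d's elementwise kernel without recomputing the sum
      # for e later; hence one kernel for c plus one for d.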
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOpt(unittest.TestCase):
def test_muladd(self):
a,b,c = [Tensor.ones(2,2) for _ in range(3)]
with CLCache():
d = a * b + c
d.realize()
assert len(CacheCollector.cache) == 1, "optimizer didn't fold muladd"
np.testing.assert_allclose(d.numpy(), np.ones((2,2))*2, rtol=1e-5)
def test_fold_reduce_elementwise(self):
img = Tensor.ones(32)
addme = Tensor.ones(1)
with CLCache():
ret = img.sum() + addme
ret.realize()
assert len(CacheCollector.cache) == 1, "optimizer didn't fold reduce/elementwise"
assert ret.numpy()[0] == 33
def test_fold_batchnorm(self):
with Tensor.train():
img = Tensor.ones(1,32,4,4)
bn = nn.BatchNorm2d(32, track_running_stats=False)
with CLCache():
img_bn = bn(img).realize()
print(img_bn)
assert len(CacheCollector.cache) == 3, f"optimizer didn't fold batchnorm, got {len(CacheCollector.cache)}"
def test_fold_conv_sgd(self):
with Tensor.train():
img = Tensor.ones(2,3,4,4)
c1 = nn.Conv2d(3,32,3)
opt = optim.SGD(get_parameters(c1))
with CLCache():
opt.zero_grad()
c1(img).relu().sum().backward()
opt.step()
# TODO: this should be 4, but the sum output child stays around
# with pushing_permutes it can be 3
# TODO: broken with optim fixes
assert len(CacheCollector.cache) in [4,5,6], f"optimizer didn't fold conv-backward SGD, got {len(CacheCollector.cache)}"
def test_fold_2convs_sgd(self):
with Tensor.train():
img = Tensor.ones(2,3,64,64)
c1 = nn.Conv2d(3,16,3,bias=False)
c2 = nn.Conv2d(16,32,3,bias=False)
opt = optim.SGD(get_parameters([c1, c2]))
with CLCache(allowed=9):
opt.zero_grad()
c2(c1(img).relu()).relu().sum().backward()
opt.step()
def test_fold_4convs_sgd(self):
with Tensor.train():
img = Tensor.ones(2,3,64,64)
c1 = nn.Conv2d(3,4,3,bias=False)
c2 = nn.Conv2d(4,8,3,bias=False)
c3 = nn.Conv2d(8,16,3,bias=False)
c4 = nn.Conv2d(16,32,3,bias=False)
opt = optim.SGD(get_parameters([c1, c2, c3, c4]))
with CLCache(allowed=19):
opt.zero_grad()
c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
opt.step()
def test_fold_conv_batchnorm_sgd(self):
with Tensor.train():
img = Tensor.ones(1,3,4,4)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
opt = optim.SGD(get_parameters([c1, bn]))
with CLCache(allowed=18): # this is too high
img_bn = bn(c1(img)).elu().sum()
opt.zero_grad()
img_bn.backward()
opt.step()
def test_fold_conv_batchnorm_notrain(self):
img = Tensor.ones(1,3,8,8)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
# precache the bn
img_conv = bn(c1(img)).relu().realize()
with CLCache():
img_conv = bn(c1(img)).relu().realize()
assert len(CacheCollector.cache) == 1, f"optimizer didn't fold conv-batchnorm at test time, got {len(CacheCollector.cache)}"
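    # The uncounted warm-up pass above realizes one-time work (weight init, etc.),
    # so the counted second pass is presumably expected to collapse the whole
    # conv+batchnorm+relu chain into a single fused kernel at inference time.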
def test_fold_conv_batchnorm(self):
with Tensor.train():
img = Tensor.ones(1,3,8,8)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
with CLCache():
img_conv = bn(c1(img)).relu().realize()
print(img_conv)
assert len(CacheCollector.cache) == 4, f"optimizer didn't fold conv-batchnorm, got {len(CacheCollector.cache)}"
def test_fold_conv_elu(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3)
c2 = nn.Conv2d(4, 4, kernel_size=3)
with CLCache():
img_conv = img.sequential([c1, Tensor.elu, c2, Tensor.elu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/elu"
def test_fold_conv_relu(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3)
c2 = nn.Conv2d(4, 4, kernel_size=3)
with CLCache():
img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"
def test_fold_conv_relu_nobias(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
c2 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
with CLCache():
img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"
def test_permute_was_pushed(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = c.permute(1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
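    # With PUSH_PERMUTES the permute is pushed above the reduce, so the sum and
    # the transposed contiguous output render as one kernel; without the flag a
    # separate copy kernel is acceptable, which is why cache_len is only checked
    # when PUSH_PERMUTES is set.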
def test_permute_was_pushed_through_contract_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache():
c = a.sum(-1)
d = c.reshape(16,16).permute(1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,16).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
def test_permute_was_pushed_through_contractw1s_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache():
c = a.sum(-1)
d = c.reshape(16,1,16).permute(2,1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,1,16).transpose(2,1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
# TODO: push permute through expansion reshape
@unittest.skip("expansion can't push expand permute yet")
@unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
def test_permute_was_pushed_through_expand_reshape(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0).reshape(4,4,4,4), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
@unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
def test_no_reduceop_rerun(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = a.sum(2).permute(1,0)
c.realize()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(c.numpy().transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
assert cache_len == 1, "reduceop was rerun!"
@unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
def test_no_reduceop_rerun_alt(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2).permute(1,0)
d = a.sum(2)
c.realize()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(c.numpy(), d.numpy().transpose(1,0), rtol=1e-3, atol=1e-5)
assert cache_len == 1, "reduceop was rerun!"
def test_fold_with_contiguous(self):
a = Tensor.randn(16, 16, 16)
b = Tensor.randn(16, 16)
with CLCache():
c = (a.sum(2).contiguous() + b).contiguous()
c.realize()
cache_len = len(CacheCollector.cache)
assert cache_len == 1, "contiguous wasn't folded"
def _test_fold_expand_reduce_helper(self, n, m, axis, allowed):
b = torch.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
with CLCache(allowed=allowed):
a = Tensor.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
a.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
return cache_len
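  # Expected budgets for the two tests below: expanding a reduction back out and
  # reducing along the same axis should fold into 2 kernels, while reducing along
  # a different axis needs the realized intermediate, hence 3. (These counts
  # mirror the allowed= values; they are not a documented guarantee.)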
  def test_expand_reduce_is_folded_on_same_axis(self):
    for axis in [0, 1]:
      for n in [4, 8, 16]:
        self._test_fold_expand_reduce_helper(n, n, axis, allowed=2)
  def test_expand_reduce_is_not_folded_on_different_axes(self):
    axis1, axis2 = 0, 1
    for n in [4, 8, 16]:
      b = torch.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
      with CLCache(allowed=3):
        a = Tensor.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
        a.realize()
      np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python
import unittest
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import LAMB
np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
W_init = np.random.randn(4,4).astype(np.float32)
m_init = np.random.randn(1,4).astype(np.float32)
class TinyNet:
def __init__(self):
self.x = Tensor(x_init.copy(), requires_grad=True)
self.W = Tensor(W_init.copy(), requires_grad=True)
self.m = Tensor(m_init.copy())
def forward(self):
out = self.x.matmul(self.W).relu()
out = out.log_softmax(1)
out = out.mul(self.m).add(self.m).sum()
return out
class TinyNetTF:
def __init__(self):
self.x = tf.Variable(x_init.copy(), trainable=True)
self.W = tf.Variable(W_init.copy(), trainable=True)
self.m = tf.constant(m_init.copy())
def forward(self):
out = tf.matmul(self.x, self.W)
out = tf.nn.relu(out)
out = tf.nn.log_softmax(out, axis=1)
out = tf.multiply(out, self.m) + self.m
out = tf.reduce_sum(out)
return out
def step(optim, steps=1, kwargs={}):
net = TinyNet()
optim = optim([net.x, net.W], **kwargs)
for _ in range(steps):
out = net.forward()
optim.zero_grad()
out.backward()
optim.step()
return net.x.detach().numpy(), net.W.detach().numpy()
def step_tf(optim, steps=1, kwargs={}):
net = TinyNetTF()
optim = optim(**kwargs)
for _ in range(steps):
with tf.GradientTape() as tape:
out = net.forward()
grads = tape.gradient(out, [net.x, net.W])
optim.apply_gradients(zip(grads, [net.x, net.W]))
return net.x.numpy(), net.W.numpy()
class ExternalTestOptim(unittest.TestCase):
def _test_optim(self, tinygrad_optim, tensorflow_optim, steps, opts, atol, rtol):
for x,y in zip(step(tinygrad_optim, steps, kwargs=opts),
step_tf(tensorflow_optim, steps, kwargs=opts)):
np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)
def _test_lamb(self, steps, opts, atol, rtol): self._test_optim(LAMB, tfa.optimizers.LAMB, steps, opts, atol, rtol)
def test_lamb(self): self._test_lamb(1, {'lr': 0.001}, 1e-5, 0)
def test_lamb_high_lr(self): self._test_lamb(1, {'lr': 10}, 1e-5, 1e-5)
def test_multistep_lamb(self): self._test_lamb(10, {'lr': 0.001}, 1e-5, 0)
def test_multistep_lamb_high_lr(self): self._test_lamb(10, {'lr': 10}, 1e-5, 3e-4)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,57 @@
# NOTE: this only tests the speed of the LLaMA codegen; it doesn't actually run the net
import unittest, time
import numpy as np
from examples.llama import Transformer, MODEL_PARAMS
from test.test_net_speed import start_profile, stop_profile
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.ops import Compiled
from tinygrad.helpers import dtypes, prod
from tinygrad.runtime.lib import RawBuffer
class FakeProgram:
def __init__(self, name:str, prg:str): pass
def __call__(self, *bufs, global_size, local_size, wait=False): pass
class RawFakeBuffer(RawBuffer):
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
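# Swapping the compiled backend's runtime and buffer for these no-ops keeps the
# full trace -> schedule -> linearize -> render pipeline running while skipping
# real compilation and execution, so the timings below should measure only the
# Python-side codegen cost.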
class TestLLaMASpeed(unittest.TestCase):
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
def test_llama_compile(self):
backup_program = Device[Device.DEFAULT].runtime
backup_buffer = Device[Device.DEFAULT].buffer
Device[Device.DEFAULT].runtime = FakeProgram
Device[Device.DEFAULT].buffer = RawFakeBuffer
print("testing llama python run time")
model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
print("built model")
# assign fake tensors to the values
for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
print("assigned empty tensors, doing warmup")
def run_llama(st, empty_method_cache=True):
if empty_method_cache: Device[Device.DEFAULT].method_cache.clear()
tms = [time.perf_counter()]
for i in range(10):
model(Tensor([[2]]), i).realize()
tms.append(time.perf_counter())
timings = [(tms[i+1]-tms[i])*1000 for i in range(len(tms)-1)]
print(f"{st:15s} mean runtime: {sum(timings)/len(timings):7.2f}ms, runs: ", ", ".join(f'{x:7.2f}' for x in timings))
run_llama("codegen")
run_llama("methodcache", False)
pr = start_profile()
run_llama("profile")
stop_profile(pr, sort='time', frac=0.1)
Device[Device.DEFAULT].runtime = backup_program
Device[Device.DEFAULT].buffer = backup_buffer
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python
import unittest
from tinygrad.tensor import Tensor
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.renderer.opencl import OpenCLRenderer
from tinygrad.graph import graph_uops
from tinygrad.nn import Conv2d
class TestUopsGraph(unittest.TestCase):
def test_matmul(self):
N = 1024
a = Tensor.rand(N,N)
b = Tensor.rand(N,N)
si = (a@b).lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
print(lin.colored_shape())
uops = lin.linearize().uops
graph_uops(uops)
for u in uops: print(u)
print(OpenCLRenderer("matmul", uops)[0])
def test_reduce(self):
a = Tensor.rand(1024*1024)
si = a.sum().lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
uops = lin.linearize().uops
graph_uops(uops)
#print(OpenCLRenderer("reduce", uops)[0])
def test_conv(self):
x = Tensor.rand(1,3,16,16)
c = Conv2d(3, 16, (3,3))
si = c(x).elu().lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
uops = lin.linearize().uops
graph_uops(uops)
print(lin.colored_shape())
print(OpenCLRenderer("conv", uops)[0])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,36 @@
import unittest
from pathlib import Path
import cv2
from examples.yolov3 import Darknet, infer, show_labels
from extra.utils import fetch
chicken_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/Chicken.jpg'))
car_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/car.jpg'))
class TestYOLO(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = Darknet(fetch("https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"))
print("Loading weights file (237MB). This might take a while…")
cls.model.load_weights("https://pjreddie.com/media/files/yolov3.weights")
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
labels = show_labels(infer(self.model, chicken_img), confidence=0.56)
self.assertEqual(labels, ["bird"])
def test_car(self):
labels = show_labels(infer(self.model, car_img))
self.assertEqual(labels, ["car"])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,76 @@
import numpy as np
from extra.utils import fetch, download_file
from examples.yolov8 import YOLOv8, get_variant_multiples, preprocess, postprocess, label_predictions
from pathlib import Path
import unittest
import io, cv2
import onnxruntime as ort
import ultralytics
from tinygrad.nn.state import safe_load, load_state_dict
class TestYOLOv8(unittest.TestCase):
def test_all_load_weights(self):
for variant in ['n', 's', 'm', 'l', 'x']:
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
download_file(f'https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{variant}.safetensors', weights_location)
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
      print(f'successfully loaded weights for yolov8{variant}')
def test_predictions(self):
test_image_urls = ['https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg', 'https://www.aljazeera.com/wp-content/uploads/2022/10/2022-04-28T192650Z_1186456067_UP1EI4S1I0P14_RTRMADP_3_SOCCER-ENGLAND-MUN-CHE-REPORT.jpg']
variant = 'n'
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
for i in range(len(test_image_urls)):
img_stream = io.BytesIO(fetch(test_image_urls[i]))
img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1)
test_image = preprocess([img])
predictions = TinyYolov8(test_image)
post_predictions = postprocess(preds=predictions, img=test_image, orig_imgs=[img])
labels = label_predictions(post_predictions)
      assert labels == ({5: 1, 0: 4, 11: 1} if i == 0 else {0: 13, 29: 1, 32: 1}), f"unexpected labels {labels} for image {i}"
def test_forward_pass_torch_onnx(self):
variant = 'n'
weights_location_onnx = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.onnx'
weights_location_pt = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.pt'
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
download_file(f'https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8{variant}.pt', weights_location_pt)
    # the ultralytics export prints a lot of unnecessary things
if not weights_location_onnx.is_file():
model = ultralytics.YOLO(model=weights_location_pt, task='Detect')
model.export(format="onnx",imgsz=[640, 480])
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
image_location = [np.frombuffer(io.BytesIO(fetch('https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg')).read(), np.uint8)]
orig_image = [cv2.imdecode(image_location[0], 1)]
input_image = preprocess(orig_image)
onnx_session = ort.InferenceSession(weights_location_onnx)
onnx_input_name = onnx_session.get_inputs()[0].name
onnx_output_name = onnx_session.get_outputs()[0].name
onnx_output = onnx_session.run([onnx_output_name], {onnx_input_name: input_image.numpy()})
tiny_output = TinyYolov8(input_image)
    # rtol is currently 0.025 because our predictions differ from torch by 1-2%:
    # the maxpooling layers in the SPPF module (line 280) pad with zeros rather
    # than with -infinity as torch does. The difference has no visible effect on
    # the predictions.
np.testing.assert_allclose(onnx_output[0], tiny_output.numpy(), atol=5e-4, rtol=0.025)
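    # A worked illustration of the padding effect (not executed here): for a pool
    # window whose real values are all negative, say (-0.5, -1.0), -inf padding
    # gives max(-0.5, -1.0, -inf) = -0.5 while zero padding gives
    # max(-0.5, -1.0, 0.0) = 0.0, so border activations can differ slightly.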
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,61 @@
import random
from tinygrad.helpers import DEBUG
from test.unit.test_shapetracker import CheckingShapeTracker
random.seed(42)
def do_permute(st):
perm = list(range(0, len(st.shape)))
random.shuffle(perm)
perm = tuple(perm)
if DEBUG >= 1: print("st.permute(", perm, ")")
st.permute(perm)
def do_pad(st):
c = random.randint(0, len(st.shape)-1)
pad = tuple((random.randint(0,2), random.randint(0,2)) if i==c else (0,0) for i in range(len(st.shape)))
if DEBUG >= 1: print("st.pad(", pad, ")")
st.pad(pad)
def do_reshape_split_one(st):
c = random.randint(0, len(st.shape)-1)
poss = [n for n in [1,2,3,4,5] if st.shape[c]%n == 0]
spl = random.choice(poss)
shp = st.shape[0:c] + (st.shape[c]//spl, spl) + st.shape[c+1:]
if DEBUG >= 1: print("st.reshape(", shp, ")")
st.reshape(shp)
def do_reshape_combine_two(st):
if len(st.shape) < 2: return
c = random.randint(0, len(st.shape)-2)
shp = st.shape[:c] + (st.shape[c] * st.shape[c+1], ) + st.shape[c+2:]
if DEBUG >= 1: print("st.reshape(", shp, ")")
st.reshape(shp)
def do_shrink(st):
c = random.randint(0, len(st.shape)-1)
while 1:
shrink = tuple((random.randint(0,s), random.randint(0,s)) if i == c else (0,s) for i,s in enumerate(st.shape))
if all(x<y for (x,y) in shrink): break
if DEBUG >= 1: print("st.shrink(", shrink, ")")
st.shrink(shrink)
def do_stride(st):
c = random.randint(0, len(st.shape)-1)
stride = tuple(random.choice([-2,-1,2]) if i==c else 1 for i in range(len(st.shape)))
if DEBUG >= 1: print("st.stride(", stride, ")")
st.stride(stride)
def do_expand(st):
c = [i for i,s in enumerate(st.shape) if s==1]
if len(c) == 0: return
c = random.choice(c)
expand = tuple(random.choice([2,3,4]) if i==c else s for i,s in enumerate(st.shape))
if DEBUG >= 1: print("st.expand(", expand, ")")
st.expand(expand)
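# CheckingShapeTracker (from test.unit.test_shapetracker) appears to mirror every
# movement op on a plain numpy array; assert_same() then checks that indexing
# through the symbolic ShapeTracker matches that numpy ground truth, so any
# simplification bug in the random 8-op chains below surfaces as a mismatch.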
if __name__ == "__main__":
ops = [do_permute, do_pad, do_shrink, do_reshape_split_one, do_reshape_combine_two, do_stride, do_expand]
for _ in range(200):
st = CheckingShapeTracker((random.randint(2, 10), random.randint(2, 10), random.randint(2, 10)))
for i in range(8): random.choice(ops)(st)
st.assert_same()

View File

@@ -0,0 +1,69 @@
import itertools
import random
from tinygrad.helpers import DEBUG
from tinygrad.shape.symbolic import Variable
random.seed(42)
def add_v(expr, rng=None):
if rng is None: rng = random.randint(0,2)
return expr + v[rng], rng
def div(expr, rng=None):
if rng is None: rng = random.randint(1,9)
return expr // rng, rng
def mul(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr * rng, rng
def mod(expr, rng=None):
if rng is None: rng = random.randint(1,9)
return expr % rng, rng
def add_num(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr + rng, rng
def lt(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr < rng, rng
def ge(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr >= rng, rng
def le(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr <= rng, rng
def gt(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr > rng, rng
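# Fuzz strategy: build a random tape of ops applied to a symbolic expression,
# replay the same tape on concrete ints, and compare against eval() of the
# rendered expression. A worked case with the ops above: the tape add_v(v1),
# add_num(3), div(2) maps 0 -> v1 -> v1+3 -> (v1+3)//2, which at v1=7 is
# (7+3)//2 = 5, so the rendered symbolic form must also evaluate to 5 there.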
if __name__ == "__main__":
ops = [add_v, div, mul, add_num, mod]
for _ in range(1000):
upper_bounds = [*list(range(1, 10)), 16, 32, 64, 128, 256]
u1 = Variable("v1", 0, random.choice(upper_bounds))
u2 = Variable("v2", 0, random.choice(upper_bounds))
u3 = Variable("v3", 0, random.choice(upper_bounds))
v = [u1,u2,u3]
tape = [random.choice(ops) for _ in range(random.randint(2, 30))]
# 10% of the time, add one of lt, le, gt, ge
if random.random() < 0.1: tape.append(random.choice([lt, le, gt, ge]))
expr = Variable.num(0)
rngs = []
for t in tape:
expr, rng = t(expr)
if DEBUG >= 1: print(t.__name__, rng)
rngs.append(rng)
    if DEBUG >= 1: print(expr)
space = list(itertools.product(range(u1.min, u1.max+1), range(u2.min, u2.max+1), range(u3.min, u3.max+1)))
volume = len(space)
for (v1, v2, v3) in random.sample(space, min(100, volume)):
v = [v1,v2,v3]
rn = 0
for t,r in zip(tape, rngs): rn, _ = t(rn, r)
num = eval(expr.render())
assert num == rn, f"mismatched {expr.render()} at {v1=} {v2=} {v3=} = {num} != {rn}"
if DEBUG >= 1: print(f"matched {expr.render()} at {v1=} {v2=} {v3=} = {num} == {rn}")

View File

@@ -0,0 +1,61 @@
import unittest
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d, BatchNorm2d, optim
def model_step(lm):
with Tensor.train():
x = Tensor.ones(8,12,128,256, requires_grad=False)
optimizer = optim.SGD(get_parameters(lm), lr=0.001)
loss = lm.forward(x).sum()
optimizer.zero_grad()
loss.backward()
del x,loss
optimizer.step()
class TestBatchnorm(unittest.TestCase):
def test_conv(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
def forward(self, x):
return self.c(x).relu()
lm = LilModel()
model_step(lm)
def test_two_conv(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
self.c2 = Conv2d(32, 32, 3, padding=1, bias=False)
def forward(self, x):
return self.c2(self.c(x)).relu()
lm = LilModel()
model_step(lm)
def test_two_conv_bn(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 24, 3, padding=1, bias=False)
self.bn = BatchNorm2d(24, track_running_stats=False)
self.c2 = Conv2d(24, 32, 3, padding=1, bias=False)
self.bn2 = BatchNorm2d(32, track_running_stats=False)
def forward(self, x):
x = self.bn(self.c(x)).relu()
return self.bn2(self.c2(x)).relu()
lm = LilModel()
model_step(lm)
def test_conv_bn(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
self.bn = BatchNorm2d(32, track_running_stats=False)
def forward(self, x):
return self.bn(self.c(x)).relu()
lm = LilModel()
model_step(lm)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,74 @@
import unittest
import numpy as np
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, CI
def multidevice_test(fxn):
exclude_devices = getenv("EXCLUDE_DEVICES", "").split(",")
def ret(self):
for device in Device._buffers:
if device in ["DISK", "SHM", "FAKE"]: continue
if not CI: print(device)
if device in exclude_devices:
if not CI: print(f"WARNING: {device} test is excluded")
continue
with self.subTest(device=device):
try:
Device[device]
except Exception:
if not CI: print(f"WARNING: {device} test isn't running")
continue
fxn(self, device)
return ret
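# multidevice_test runs one test body as a subTest per registered backend:
# devices named in the comma-separated EXCLUDE_DEVICES env var are skipped, as
# are backends whose Device[device] lookup raises, so a missing driver doesn't
# fail the whole suite.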
class TestExample(unittest.TestCase):
@multidevice_test
def test_convert_to_cpu(self, device):
a = Tensor([[1,2],[3,4]], device=device)
assert a.numpy().shape == (2,2)
b = a.cpu()
assert b.numpy().shape == (2,2)
@multidevice_test
def test_2_plus_3(self, device):
a = Tensor([2], device=device)
b = Tensor([3], device=device)
result = a + b
print(f"{a.numpy()} + {b.numpy()} = {result.numpy()}")
assert result.numpy()[0] == 5.
@multidevice_test
def test_example_readme(self, device):
x = Tensor.eye(3, device=device, requires_grad=True)
y = Tensor([[2.0,0,-2.0]], device=device, requires_grad=True)
z = y.matmul(x).sum()
z.backward()
x.grad.numpy() # dz/dx
y.grad.numpy() # dz/dy
assert x.grad.device == device
assert y.grad.device == device
@multidevice_test
def test_example_matmul(self, device):
x = Tensor.eye(64, device=device, requires_grad=True)
y = Tensor.eye(64, device=device, requires_grad=True)
z = y.matmul(x).sum()
z.backward()
x.grad.numpy() # dz/dx
y.grad.numpy() # dz/dy
assert x.grad.device == device
assert y.grad.device == device
if __name__ == '__main__':
unittest.main()