Add openpilot tests

FrogAi
2024-03-06 14:58:47 -07:00
parent 2901597132
commit b39097a12d
259 changed files with 31176 additions and 12 deletions

View File

@@ -0,0 +1,12 @@
FROM ubuntu:20.04

# Install Python 3.8 and pip
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.8 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Install tinygrad and its Python dependencies as an editable package
COPY . ./tinygrad
WORKDIR tinygrad
RUN pip3 install -e .
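
A quick smoke test for the resulting image (the `tinygrad-test` tag is hypothetical; build with `docker build -t tinygrad-test .`):

# e.g. docker run --rm tinygrad-test python3 -c "from tinygrad.tensor import Tensor; print(Tensor.ones(2,2).numpy())"
from tinygrad.tensor import Tensor
print(Tensor.ones(2, 2).numpy())  # the editable install works if this prints a 2x2 array of ones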

View File

@@ -0,0 +1,62 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()

from extra.dist import collectives
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def allreduce_jit(t:Tensor, cache_id=None) -> Tensor:
  return collectives.allreduce(t, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2
SIZE_2 = 255 if not CI else 3

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.zeros(SIZE, SIZE) if rank != 0 else Tensor.ones(SIZE, SIZE)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test")
    assert np.allclose(np.ones((SIZE, SIZE)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  # reset jit
  allreduce_jit.cnt = 0
  allreduce_jit.input_replace = {}

  # test uneven chunk sizes
  for _ in range(3):
    # create a tensor to send
    t = Tensor.ones(SIZE_2, SIZE_2, SIZE_2) if rank == 0 else Tensor.zeros(SIZE_2, SIZE_2, SIZE_2)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test2")
    assert np.allclose(np.ones((SIZE_2, SIZE_2, SIZE_2)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    from tinygrad.runtime.ops_hip import HIP
    devices = [f"hip:{i}" for i in range(HIP.device_count)]
  else:
    from tinygrad.runtime.ops_gpu import CL
    devices = [f"gpu:{i}" for i in range(len(CL.devices))] if not CI else ["gpu:0", "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
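
For intuition, the invariant this test checks can be restated in plain numpy: allreduce sums every rank's tensor and hands the identical result back to all ranks, so one rank contributing ones and the rest contributing zeros must reduce to ones everywhere. A minimal sketch (the two-element world mirrors the CI device list above):

import numpy as np
world = [np.ones((2, 2)), np.zeros((2, 2))]  # rank 0 sends ones, rank 1 sends zeros
reduced = sum(world)                         # every rank ends up with this same sum
assert np.allclose(reduced, np.ones((2, 2)))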

View File

@@ -0,0 +1,68 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()

from extra.dist import world
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def send_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.send(t, target_rank, cache_id=cache_id).realize()

@TinyJit
def recv_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.recv(t, target_rank, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.randn(SIZE, SIZE)

    # send to rank 1
    if rank == 0:
      send_jit(t, 1, cache_id="test")
    elif rank == 1:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 0, cache_id="test")

    # recv from rank 1
    if rank == 0:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 1, cache_id="test2")
    elif rank == 1:
      send_jit(t2, 0, cache_id="test2")

    # check that the received tensor is the same as the sent tensor
    if rank == 0:
      assert np.allclose(t.numpy(), t2.numpy()), f"{t2.numpy()} wasn't equal to {t.numpy()}"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    devices = ["hip:0", "hip:1"]
  else:
    devices = ["gpu:0", "gpu:1" if not CI else "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
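
The round trip above reduces to a simple property: what rank 0 sends out must come back unchanged after rank 1 echoes it. The check rank 0 performs, restated in plain numpy:

import numpy as np
t = np.random.randn(4, 4).astype(np.float32)
wire = t.copy()       # rank 0 -> rank 1
echoed = wire.copy()  # rank 1 -> rank 0
assert np.allclose(t, echoed)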

View File

@@ -0,0 +1,27 @@
import unittest
from tinygrad.helpers import prod
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import GlobalCounters
from tinygrad.jit import CacheCollector

class TestCopy(unittest.TestCase):
  def test_add1(self):
    pts = []
    for i in range(16384, 16384*256, 16384):
      t = Tensor.randn(i).realize()
      CacheCollector.start()
      t.assign(t+1).realize()
      fxn, args, _ = CacheCollector.finish()[0]
      GlobalCounters.reset()
      def run(): return fxn(args, force_wait=True)
      ct = min([run() for _ in range(10)])
      mb = prod(t.shape)*t.dtype.itemsize*2*1e-6
      print(f"{mb*1e3:.2f} kB, {ct*1e3:.2f} ms, {mb/ct:.2f} MB/s")
      pts.append((mb, mb/ct))
    from matplotlib import pyplot as plt
    plt.plot([x[0] for x in pts], [x[1] for x in pts])
    plt.show()

if __name__ == '__main__':
  unittest.main()
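
For reference, the MB/s figure counts both the read and the write of the buffer (assign(t+1) touches each element twice), which is the factor of 2 in the mb expression. Recomputed standalone with illustrative numbers:

numel, itemsize, seconds = 16384, 4, 0.001  # hypothetical measurement
mb = numel * itemsize * 2 * 1e-6            # megabytes moved: one read plus one write
print(f"{mb/seconds:.2f} MB/s")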

View File

@@ -0,0 +1,102 @@
from lm_eval.base import BaseLM
from lm_eval import evaluator, tasks
import torch, json, argparse
from examples.llama import LLaMa
from tinygrad.tensor import Tensor
from tinygrad.ops import Device

class LLaMaAdaptor(BaseLM):
  def __init__(
    self,
    model_size="7B",
    model_gen=1,
    device="",
    quantize=False,
    batch_size=1,
    max_batch_size=1,
    do_sample=False,
    temperature=1.0,
    checkpoint_path="",
    tokenizer_path="",
  ):
    super().__init__()

    if batch_size is None:
      batch_size = 1
    self.do_sample = do_sample
    self.temperature = temperature
    self._device = device

    assert isinstance(model_gen, int)
    assert isinstance(model_size, str)
    assert isinstance(batch_size, int)
    assert isinstance(checkpoint_path, str)
    assert isinstance(tokenizer_path, str)

    self.llama = LLaMa.build(checkpoint_path, tokenizer_path, model_gen, model_size, quantize)

  @classmethod
  def create_from_arg_string(cls, arg_string, additional_config=None):
    kwargs = {el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",")}
    return cls(**kwargs, **(additional_config or {}))

  @property
  def eot_token_id(self):
    # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
    return self.llama.tokenizer.eos_id()

  @property
  def max_length(self):
    return 1024

  @property
  def max_gen_toks(self):
    return 256

  @property
  def batch_size(self):
    return 1

  @property
  def device(self):
    return self._device

  def tok_encode(self, string: str):
    return [self.llama.tokenizer.bos_id()] + self.llama.tokenizer.encode(string)

  def tok_decode(self, tokens):
    return self.llama.tokenizer.decode(tokens)

  def _model_call(self, inps):
    Tensor.no_grad = True
    return torch.Tensor(self.llama.model(Tensor(inps.numpy()), 0).numpy())

  def greedy_until(self, requests):
    continuations = []
    for request in requests:
      prompt, until = request[0], request[1]['until']
      output = self.llama.greedy_until(prompt, until, max_length=128, temperature=0.0)
      continuations.append(output[len(prompt):])
    return continuations

  def _model_generate(self, context, max_length, eos_token_id):
    raise NotImplementedError()

if __name__ == '__main__':
  print(f"using {Device.DEFAULT} backend")

  parser = argparse.ArgumentParser(description='Run LLaMA evals in tinygrad', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--size', type=str, default="7B", help="Size of model to use [7B, 13B, 30B, 65B] for Gen 1, [7B, 13B] for Gen 2")
  parser.add_argument('--gen', type=int, default=1, help="Generation of the model to use [1, 2]")
  parser.add_argument('--quantize', action='store_true', help="Quantize the weights to int8 in memory")
  parser.add_argument('--eval', type=str, default="arc_easy", help="Evaluation task(s) to run, comma separated")
  parser.add_argument('--limit', type=int, default=None, help="Limit the number of examples per eval task")
  parser.add_argument('--weights', type=str, default="./weights/LLaMa/", help="Location of the weights")
  parser.add_argument('--tokenizer', type=str, default="./weights/LLaMa/tokenizer.model", help="Location of the tokenizer")
  args = parser.parse_args()

  # run eval and exit
  adaptor = LLaMaAdaptor(model_gen=args.gen, model_size=args.size, quantize=args.quantize, checkpoint_path=args.weights, tokenizer_path=args.tokenizer, device="cpu")
  results = evaluator.evaluate(adaptor, tasks.get_task_dict(args.eval.split(",")), False, 0, args.limit)
  print(json.dumps(results, indent=2))
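
lm-eval can also construct the adaptor through create_from_arg_string; note that values parsed from the argument string arrive as str, so typed fields are best passed via additional_config. A hedged usage sketch (the paths are placeholders):

adaptor = LLaMaAdaptor.create_from_arg_string(
  "model_size=7B,checkpoint_path=./weights/LLaMa/,tokenizer_path=./weights/LLaMa/tokenizer.model",
  additional_config={"model_gen": 1, "device": "cpu"},
)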

View File

@@ -0,0 +1,128 @@
import csv, pathlib, time, numpy as np
from os import getenv
import torch
torch.set_num_threads(1)
import onnx
from onnx.helper import tensor_dtype_to_np_dtype
import onnxruntime as ort
from onnx2torch import convert
from extra.utils import download_file
from extra.onnx import get_run_onnx
from tinygrad.helpers import OSX, DEBUG
from tinygrad.tensor import Tensor
from tinygrad.ops import Device

MODELS = {
  "resnet50": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
  "openpilot": "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx",
  "efficientnet": "https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx",
  "shufflenet": "https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx",
  "commavq": "https://huggingface.co/commaai/commavq-gpt2m/resolve/main/gpt2m.onnx",

  # broken in torch MPS
  #"zfnet": "https://github.com/onnx/models/raw/main/vision/classification/zfnet-512/model/zfnet512-9.onnx",
  # TypeError: BatchNormalization() got an unexpected keyword argument 'is_test'
  #"densenet": "https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-3.onnx",
  # AssertionError: only onnx version >= 10 supported for slice
  #"bert": "https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-8.onnx",
  # really slow
  #"resnet18": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx",
}

CSV = {}
open_csv = None
torch.manual_seed(1)

def benchmark(mnm, nm, fxn):
  tms = []
  for _ in range(3):
    st = time.perf_counter_ns()
    ret = fxn()
    tms.append(time.perf_counter_ns() - st)
  print(f"{mnm:15s} {nm:25s} {min(tms)*1e-6:7.2f} ms")
  CSV[nm] = min(tms)*1e-6
  return min(tms), ret

#BASE = pathlib.Path(__file__).parents[2] / "weights" / "onnx"
BASE = pathlib.Path("/tmp/onnx")

def benchmark_model(m, validate_outs=False):
  global open_csv, CSV
  CSV = {"model": m}

  fn = BASE / MODELS[m].split("/")[-1]
  download_file(MODELS[m], fn)
  onnx_model = onnx.load(fn)
  output_names = [out.name for out in onnx_model.graph.output]
  excluded = {inp.name for inp in onnx_model.graph.initializer}
  input_shapes = {inp.name:tuple(x.dim_value if x.dim_value != 0 else 1 for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input if inp.name not in excluded}
  input_types = {inp.name: tensor_dtype_to_np_dtype(inp.type.tensor_type.elem_type) for inp in onnx_model.graph.input if inp.name not in excluded}
  #input_types = {k:v if v!=np.float16 else np.float32 for k,v in input_types.items()} # cast
  np_inputs = {k:torch.randn(shp).numpy().astype(input_types[k]) for k,shp in input_shapes.items()}
  assert len(input_shapes) < 30, f"too many input shapes {len(input_shapes)}"

  # print input names
  if DEBUG >= 2: print([inp.name for inp in onnx_model.graph.input if inp.name not in excluded])

  for device in ["METAL" if OSX else "GPU", "CLANG"]: # + (["CUDA"] if torch.cuda.is_available() else []):
    Device.DEFAULT = device
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})

    from tinygrad.jit import TinyJit
    tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
    for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}  # warm up the jit
    benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()})
    del inputs, tinygrad_model, tinygrad_jitted_model

  try:
    torch_model = convert(onnx_model)
    torch_inputs = [torch.tensor(x) for x in np_inputs.values()]
    benchmark(m, "torch_cpu", lambda: torch_model(*torch_inputs))

    torch_device = "mps" if OSX else "cuda"
    torch_mps_model = torch_model.to(torch_device)
    torch_mps_inputs = [x.to(torch_device) for x in torch_inputs]
    benchmark(m, f"torch_{torch_device}", lambda: torch_mps_model(*torch_mps_inputs))
  except Exception as e: print(f"{m:16s}onnx2torch {type(e).__name__:>25}")

  # bench onnxruntime
  ort_options = ort.SessionOptions()
  ort_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
  ort_options.log_severity_level = 3  # no warnings
  for backend in ["CPU", "CUDA" if not OSX else "CoreML"]:  # https://onnxruntime.ai/docs/execution-providers/
    provider = backend+"ExecutionProvider"
    if provider not in ort.get_available_providers(): continue
    ort_sess = ort.InferenceSession(str(fn), ort_options, [provider])
    benchmark(m, f"onnxruntime_{backend.lower()}", lambda: ort_sess.run(output_names, np_inputs))
    del ort_sess

  if validate_outs:
    rtol, atol = 2e-3, 2e-3  # tolerance for fp16 models
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    tinygrad_out = tinygrad_model(inputs)

    ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
    onnx_out = ort_sess.run(output_names, np_inputs)
    onnx_out = dict(zip(output_names, onnx_out))
    assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
    print(f"{m:16s}outputs validated with rtol={rtol:.1e}, atol={atol:.1e}")

  if open_csv is None:
    open_csv = csv.DictWriter(open('onnx_inference_speed.csv', 'w', newline=''), fieldnames=list(CSV.keys()))
    open_csv.writeheader()
  open_csv.writerow(CSV)

def assert_allclose(tiny_out:dict, onnx_out:dict, rtol=1e-5, atol=1e-5):
  assert len(tiny_out) == len(onnx_out) and tiny_out.keys() == onnx_out.keys()
  for k in tiny_out.keys():
    tiny_v, onnx_v = tiny_out[k], onnx_out[k]
    if tiny_v is None: assert tiny_v == onnx_v
    else: np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}")

if __name__ == "__main__":
  if getenv("MODEL", "") != "": benchmark_model(getenv("MODEL", ""), True)
  else:
    for m in MODELS: benchmark_model(m, True)
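
The benchmark() helper above is the usual min-of-N wall-clock pattern: taking the minimum over repeated runs filters out warm-up and scheduling noise. The same idea as a standalone sketch:

import time
def time_fxn(fxn, n=3):
  tms = []
  for _ in range(n):
    st = time.perf_counter_ns()
    fxn()
    tms.append(time.perf_counter_ns() - st)
  return min(tms) * 1e-6  # best-of-n, in milliseconds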

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# cd disassemblers/ && git clone --recursive git@github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored, Timing
from tinygrad.runtime.ops_gpu import CL

# TODO: support multidevice in cuda
device = 'gpu'

if __name__ == "__main__":
  sz = 1024*1024*256  # 1 GB
  #sz = 1024*64

  with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
    c0 = Tensor.ones(sz, device="cpu").realize()
    c1 = (Tensor.ones(sz, device="cpu")/2).realize()

  with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a0 = c0.to(f'{device}:0').realize()
    CL.synchronize()
  with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b1 = c1.to(f'{device}:1').realize()
    CL.synchronize()

  # cross copy. this is going through the CPU
  with Timing("0 -> CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a1 = a0.to(f'{device}:1').realize()
    CL.synchronize()
  with Timing("1 -> CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b0 = b1.to(f'{device}:0').realize()
    CL.synchronize()

  # sum
  with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab0 = (a0 + b0).realize()
    CL.synchronize()
  with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab1 = (a1 + b1).realize()
    CL.synchronize()

  # cross device sum (does this work?)
  # is this making a copy first? is that copy through the CPU?
  # the slowness comes from the *blocking* clprg call, is this pyopencl?
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx0 = (a0 + b1).realize()
    CL.synchronize()
  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx1 = (b1 + a0).realize()
    CL.synchronize()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc1 = ab1.numpy()

  # same
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # devices
  print(ab0)
  print(ab1)
  print(abx0)
  print(abx1)
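
A note on the on_exit math used throughout: Timing passes the elapsed time in nanoseconds, and bytes per nanosecond equals gigabytes per second, so (sz*4)/x already yields GB/sec for float32 buffers. For example:

sz, ns = 1024*1024*256, 250_000_000  # 1 GB of float32 moved in 0.25 s
print(f"{(sz*4)/ns:.2f} GB/sec")     # -> ~4.29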

View File

@@ -0,0 +1,41 @@
from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
from tinygrad.helpers import dtypes
import time

N = 1000000
a = CLBuffer(N, dtypes.float32)
b = CLBuffer(N, dtypes.float32)
c = CLBuffer(N, dtypes.float32)

prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
  int idx = get_global_id(0);
  a[idx] = b[idx] + c[idx];
}""")

prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
t1 = time.monotonic_ns()
e1 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t2 = time.monotonic_ns()
time.sleep(3)
t3 = time.monotonic_ns()
e2 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t4 = time.monotonic_ns()

print(e1.profile.queued)
print(e1.profile.submit)
print(e1.profile.start)
print(e1.profile.end)

print(e1, e2)
print(t2-t1, e1.profile.end - e1.profile.start)
print(t4-t3, e2.profile.end - e2.profile.start)
print(t3-t2, e2.profile.queued - e1.profile.end)
print((t3-t2) / (e2.profile.start - e1.profile.end), "ratio")
print("ratio since boot", t1/e1.profile.start)

print(e1.profile.start)
print(e1.profile.end)
print(e2.profile.start)
print(e2.profile.end)
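
The e1/e2.profile fields above are OpenCL's event profiling counters: queued, submit, start, and end are device-side timestamps in nanoseconds, which is why they can be compared directly against the monotonic_ns deltas. Kernel execution time alone would be:

def kernel_ms(evt):
  # pyopencl exposes CL_PROFILING_COMMAND_{START,END} in nanoseconds
  return (evt.profile.end - evt.profile.start) * 1e-6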

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env python
import unittest, gc
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_state_dict
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad.ops import Device
from test.helpers import derandomize_model
from examples.llama import Transformer

ALLOCATED_DEV_BUFS = 0
class FakeDeviceBuffer:
  def __init__(self, sz, dt, device):
    self.id = 1
    self.size = sz
    self.dtype = dt
    self.device = device

    global ALLOCATED_DEV_BUFS
    ALLOCATED_DEV_BUFS += 1

class FakeAllocator(LRUAllocator):
  def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
  def _do_free(self, buf):
    buf.id -= 1
    assert buf.id == 0, f"Free should be called once, but {buf.id}"
  def __del__(self):  # Fake allocator should clear all buffers after each test.
    for v in self.cached_buffers.values():
      for buf, _ in v: self._free_buffer(buf)

FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
  def __init__(self, size, dtype, device='0'):
    global FAKE_GLOBAL_ALLOCATOR
    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
    assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)

class FakeProgram:
  def __init__(self, name:str, prg:str): pass
  def __call__(self, *bufs, global_size, local_size, wait=False): pass

def helper_test_correctness(gen, train):
  from tinygrad.runtime.ops_gpu import CL, CLAllocator
  old_alloc = CL.cl_allocator
  CL.cl_allocator = CLAllocator(0)
  no_alloc_result = train(*gen()).numpy()
  Device[Device.DEFAULT].synchronize()
  CL.cl_allocator = CLAllocator(512<<30)  # test cache correctness, so cache as much as possible (512 GB)
  for _ in range(4):
    GlobalCounters.reset()
    np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
    Device[Device.DEFAULT].synchronize()
  assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
  CL.cl_allocator = old_alloc

def __helper_test_alloc_count(gen, train):
  was_alloc = ALLOCATED_DEV_BUFS
  for _ in range(2):
    train(*gen())
  return ALLOCATED_DEV_BUFS - was_alloc

def helper_test_alloc_count(mm, gen, train):
  global FAKE_GLOBAL_ALLOCATOR
  backup_program = Device[Device.DEFAULT].runtime
  backup_buffer = Device[Device.DEFAULT].buffer
  Device[Device.DEFAULT].runtime = FakeProgram
  Device[Device.DEFAULT].buffer = FakeBuffer
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
  new_allocs = __helper_test_alloc_count(gen, train)
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
  old_allocs = __helper_test_alloc_count(gen, train)
  print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
  assert new_allocs < old_allocs, "Hmm, doesn't the cache work anymore?"
  Device[Device.DEFAULT].runtime = backup_program
  Device[Device.DEFAULT].buffer = backup_buffer
  FAKE_GLOBAL_ALLOCATOR = None

def check_gc():
  if Device.DEFAULT == "GPU":
    gc.collect()  # Need to collect Tensors.
    from extra.introspection import print_objects
    assert print_objects() == 0

class TestAllocators(unittest.TestCase):
  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def __test():
      model = Transformer(**args_tiny)
      derandomize_model(model)
      def test(t): return model(t, 0).realize()
      helper_test_correctness(lambda: (Tensor([[1,]]),), test)
    __test()
    Tensor.default_type = old_type
    check_gc()

  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama_alloc_counts(self):
    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def test_alloc_count(t):
      model = Transformer(**args_tiny)
      for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
      return model(t, 0).realize()
    helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
    check_gc()

  @unittest.skip("huge for CI")
  def test_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()
    helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16), Tensor.randn(1, 77, 768)), test)

if __name__ == "__main__":
  unittest.main()
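
The behavior these fakes exercise reduces to a simple idea: an LRU-style allocator parks freed buffers in a cache keyed by (size, dtype) and hands them back on the next matching request, so repeated model runs should allocate far fewer fresh device buffers. A toy sketch of that idea (not the real LRUAllocator API):

from collections import defaultdict
class ToyLRUAllocator:
  def __init__(self): self.cache = defaultdict(list)
  def alloc(self, size, dtype):
    bucket = self.cache[(size, dtype)]
    return bucket.pop() if bucket else object()  # reuse a parked buffer if possible, else "allocate"
  def free(self, buf, size, dtype):
    self.cache[(size, dtype)].append(buf)  # park for reuse instead of releasing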

View File

@@ -0,0 +1,8 @@
from tinygrad.tensor import Tensor
from tinygrad.nn import Embedding

if __name__ == "__main__":
  vocab_size = 50257
  dim = 128
  test = Embedding(vocab_size, dim)
  ret = test(Tensor([[1,2,3]])).numpy()
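A sanity check one could append: Embedding maps (batch, seq) integer ids to (batch, seq, dim) vectors, so for this input:

  assert ret.shape == (1, 3, 128), ret.shape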

View File

@@ -0,0 +1,208 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.ops import LazyOp, ReduceOps, BinaryOps, UnaryOps, MovementOps
from tinygrad.shape.shapetracker import ShapeTracker, View, ZeroView
from tinygrad.runtime.ops_gpu import GPUBuffer, CLProgram, CLCodegen
#from tinygrad.runtime.ops_metal import MetalBuffer as GPUBuffer, MetalProgram as CLProgram, MetalCodegen as CLCodegen
from tinygrad.helpers import getenv
from extra.lib_test_ast import test_ast
import platform
OSX = platform.system() == "Darwin"

def compile_and_test_ast(ast, local_size=None):
  k = CLCodegen(ast)
  prg = k.codegen().build(CLProgram)
  if local_size is not None: prg.local_size = local_size
  for i in range(5): prg(prg.lower(k.bufs))
  if getenv("TEST", 0): test_ast(k)

class TestAST(unittest.TestCase):
  def test_conv_zeroview_ast(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (2, 2, 2, 1), -3), ZeroView((1, 1, 1, 2), ((0, 1), (0, 1), (-1, 2), (-1, 3))), View((1, 1, 3, 4), (0, 0, 4, 1), 0)]))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (0, 0, 0, 0), 0)]))
    op1 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(UnaryOps.RELU, (op1,), None)
    compile_and_test_ast(ast)

  def test_cifar_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 64, 34, 34), (65536, 1024, 32, 1), -33), ZeroView((512, 64, 32, 32), ((0, 512), (0, 64), (-1, 33), (-1, 33))), View((512, 1, 128, 32, 32, 64, 3, 3), (73984, 73984, 0, 34, 1, 1156, 34, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 64, 32, 32), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 1, 128, 32, 32, 64, 3, 3), (0, 0, 576, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 64, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(ReduceOps.SUM, (op0,), (512, 1, 128, 32, 32, 1, 1, 1))
    compile_and_test_ast(ast)

  def test_cifar_conv_backward(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 512, 10, 10), (64, 16384, 8, 1), -9), ZeroView((256, 512, 8, 8), ((0, 256), (0, 512), (-1, 9), (-1, 9))), View((256, 1, 512, 3, 3, 512, 8, 8), (51200, 51200, 0, 10, 1, 100, 10, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 256, 8, 8), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 1, 512, 3, 3, 512, 8, 8), (0, 0, 64, 0, 0, 32768, 8, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 512, 8, 8), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (256, 1, 512, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (256, 512, 3, 3))
    compile_and_test_ast(ast)

  def test_first_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 130, 258, 1, 12), (393216, 3072, 12, 12, 1), -3084), ZeroView((1, 128, 256, 1, 12), ((0, 1), (-1, 129), (-1, 257), (0, 1), (0, 12))), View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 6192, 24, 0, 0, 3096, 12, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 768, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 0, 0, 432, 4, 144, 16, 48, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 108, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_second_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 66, 130, 32, 1), (262144, 4096, 32, 1, 1), -4128), ZeroView((1, 64, 128, 32, 1), ((0, 1), (-1, 65), (-1, 129), (0, 32), (0, 1))), View((1, 64, 128, 8, 4, 1, 1, 3, 3), (266240, 4160, 32, 4, 1, 12480, 12480, 4160, 32), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 64, 128, 8, 4, 1, 1, 3, 3), (0, 0, 0, 36, 1, 0, 0, 12, 4), 0)]), hostbuf=GPUBuffer(shape=(8, 9, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_third_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 4096, 32, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 0, 0, 128, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(4, 32, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 4, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 4, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(16,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op2,), (64, 512, 4))
    compile_and_test_ast(ast)

  # VALIDHACKS=1 IMAGE=2 DEBUG=4 PYTHONPATH="." GPU=1 OPT=2 python3 test/external_test_gpu_ast.py TestAST.test_reduce_op
  # 164 time 27.75 ms running re_S128_4 with [128] None count 4 runtime 1016.06 us 2.07 GFLOPS () -> (128, 1)
  # 169 time 22.51 ms running matmul with [4, 16, 128] [4, 16, 16] count 5 runtime 110.08 us 19.06 GFLOPS ('-DMATMUL',) -> (128, 1)
  def test_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 128, 4, 1, 1, 1, 1), (512, 512, 512, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  def test_alt_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  # re_S32_16_36_6 is fast
  def test_1x1_36_6(self):  # 36 <- 6
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 1536, 24, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 0, 0, 0, 0, 96, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(36, 24, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(144,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op6 = LazyOp(BinaryOps.MAX, (op5,buf5,), None)
    op7 = LazyOp(BinaryOps.SUB, (op3,op6,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op7,), (32, 2304, 4))
    compile_and_test_ast(ast, None if OSX else (16, 16, 4))

  # re_S32_16_6_36 is slow
  def test_1x1_6_36(self):  # 6 <- 36
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 9216, 144, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 2304, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 0, 0, 0, 0, 576, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 144, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 1536, 24, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    op3 = LazyOp(BinaryOps.ADD, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 16, 4))

  # re_S32_16_6_24
  def test_1x1_6_24(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 6144, 96, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 1536, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 0, 0, 0, 0, 384, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 96, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    #buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    #op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 4, 8))

  def test_full_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (512, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op0 = LazyOp(BinaryOps.ADD, (buf0,buf1,), None)
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([2.], dtype=np.float32)))
    op1 = LazyOp(BinaryOps.POW, (op0,buf2,), None)
    op2 = LazyOp(ReduceOps.SUM, (op1,), (1, 1))
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([0.5], dtype=np.float32)))
    op3 = LazyOp(BinaryOps.POW, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op4 = LazyOp(BinaryOps.SUB, (op3,buf4,), None)
    op5 = LazyOp(UnaryOps.RELU, (op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op6 = LazyOp(BinaryOps.ADD, (op5,buf5,), None)
    buf6 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([3.4e+38], dtype=np.float32)))
    op7 = LazyOp(BinaryOps.SUB, (op3,buf6,), None)
    op8 = LazyOp(UnaryOps.RELU, (op7,), None)
    op9 = LazyOp(BinaryOps.SUB, (op6,op8,), None)
    op10 = LazyOp(UnaryOps.RECIPROCAL, (op9,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op10,), (1, 1))
    compile_and_test_ast(ast)

  def test_1239_reduce(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 64, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 1024, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(1239, 256, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1239, 4, 1, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (1, 1, 1, 1, 4956))
    compile_and_test_ast(ast)

  def test_enet_first_conv_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 3, 225, 225), (150528, 50176, 224, 1), 0), ZeroView((8, 3, 224, 224), ((0, 8), (0, 3), (0, 225), (0, 225))), View((8, 1, 32, 112, 112, 3, 3, 3), (151875, 151875, 0, 450, 2, 50625, 225, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 1, 32, 112, 112, 3, 3, 3), (0, 0, 27, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (8, 1, 32, 112, 112, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (8, 32, 112, 112))
    compile_and_test_ast(ast)

  def test_enet_reduce_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 32, 225, 225), (50176, 150528, 224, 1), 0), ZeroView((3, 32, 224, 224), ((0, 3), (0, 32), (0, 225), (0, 225))), View((3, 1, 32, 3, 3, 32, 112, 112), (1620000, 1620000, 0, 225, 1, 50625, 450, 2), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 1, 32, 3, 3, 32, 112, 112), (0, 12845056, 401408, 0, 0, 12544, 112, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 32, 1, 1, 32, 112, 112), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (3, 1, 32, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (3, 32, 3, 3))
    compile_and_test_ast(ast)

if __name__ == '__main__':
  unittest.main()

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python
import os
import unittest
import numpy as np

if 'IMAGE' not in os.environ:
  os.environ['IMAGE'] = '2'
  os.environ['GPU'] = '1'
  os.environ['OPT'] = '2'

from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
Tensor.no_grad = True

class TestImage(unittest.TestCase):
  def test_create_image(self):
    t = Tensor.ones(128, 128, 1)
    t = t.reshape(128, 32, 4) + 3
    t.realize()
    np.testing.assert_array_equal(t.numpy(), np.ones((128,32,4))*4)

  def test_sum_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t1.realize()
    t1 = t1.sum()
    t1.realize()
    assert t1.numpy() == 16*4*4*4, f"got {t1.numpy()}"

  def test_add_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t2 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 4
    t1.realize()
    t2.realize()
    t3 = t1 + t2
    t3.realize()
    np.testing.assert_array_equal(t3.numpy(), np.ones((16,4,4))*9)

  def test_padded_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    tiny_conv(tiny_dat).realize()

  def test_op_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dconv = Conv2d(out_chans, out_chans, 1, bias=None, padding=0)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    p2 = tiny_conv(tiny_dat).relu()
    p2 = tiny_dconv(p2)
    p2.realize()

if __name__ == '__main__':
  unittest.main()

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
from tinygrad.helpers import dtypes, CI
from tinygrad.ops import Device
from test.helpers import derandomize_model
from examples.llama import Transformer

def helper_test_jitted_correctness(gen, train, train_jit):
  nojit = train(*gen()).numpy()
  for _ in range(5): jit = train_jit(*gen()).numpy()
  np.testing.assert_allclose(nojit, jit, rtol=1e-3, atol=1e-5)

@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE, "needs JIT")
class TestJittedModels(unittest.TestCase):
  def test_jitted_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    derandomize_model(model)
    def test(t): return model(t, 0).realize()

    @TinyJit
    def test_jit(t): return model(t, 0).realize()
    helper_test_jitted_correctness(lambda: (Tensor([[1,]]),), test, test_jit)
    Tensor.default_type = old_type

  @unittest.skipUnless(not CI, "huge for CI")
  def test_jitted_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()

    @TinyJit
    def test_jit(t, t2): return model(t, 801, t2).realize()
    helper_test_jitted_correctness(lambda: (Tensor.randn(1, 4, 16, 16), Tensor.randn(1, 77, 768)), test, test_jit)

if __name__ == "__main__":
  unittest.main()
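
Context for the helper above: TinyJit captures kernels during the first calls and replays them afterwards, which is why train_jit runs several times before its output is compared against the unjitted baseline. The basic pattern, as a sketch:

from tinygrad.jit import TinyJit
from tinygrad.tensor import Tensor

@TinyJit
def double(x: Tensor) -> Tensor: return (x * 2).realize()

for _ in range(3): out = double(Tensor.randn(4, 4))  # early runs capture kernels, later runs replay them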

View File

@@ -0,0 +1,208 @@
import unittest
from onnx.backend.base import Backend, BackendRep
import onnx.backend.test
import numpy as np
from tinygrad.tensor import Tensor
from typing import Any, Tuple
from tinygrad.helpers import getenv, CI
# pip3 install tabulate
pytest_plugins = 'onnx.backend.test.report',
from extra.onnx import get_run_onnx
class TinygradModel(BackendRep):
  def __init__(self, run_onnx, input_names):
    super().__init__()
    self.fxn = run_onnx
    self.input_names = input_names

  def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
    real_inputs = {k:v for k,v in zip(self.input_names, inputs)}
    ret = self.fxn(real_inputs, debug=True)
    return tuple(x.numpy() if isinstance(x, Tensor) else [i.numpy() for i in x] if isinstance(x, list) else np.array(x) for x in ret.values())

class TinygradBackend(Backend):
  @classmethod
  def prepare(cls, model, device):
    input_all = [x.name for x in model.graph.input]
    input_initializer = [x.name for x in model.graph.initializer]
    net_feed_input = [x for x in input_all if x not in input_initializer]
    print("prepare", cls, device, net_feed_input)
    run_onnx = get_run_onnx(model)
    return TinygradModel(run_onnx, net_feed_input)

  @classmethod
  def supports_device(cls, device: str) -> bool:
    return device == "CPU"
backend_test = onnx.backend.test.BackendTest(TinygradBackend, __name__)
# no support for reduce with multiply (needs llop)
backend_test.exclude('test_reduce_prod_*')
# TODO figure out why it's returning wrong values, geohotstan's uneducated guess is it's due to imprecision from float64 (double) -> float32
# see Type Constraints: https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Adam.html#type-constraints
backend_test.exclude('test_adam_multiple_cpu')
backend_test.exclude('test_nesterov_momentum_cpu')
# we only support float32
backend_test.exclude('uint8')
backend_test.exclude('uint16')
backend_test.exclude('uint32')
backend_test.exclude('uint64')
backend_test.exclude('int8')
backend_test.exclude('int16')
backend_test.exclude('float64')
backend_test.exclude('string')
backend_test.exclude('test_pow_types_int*')
backend_test.exclude('test_cast_*')
backend_test.exclude('test_castlike_*')
backend_test.exclude('test_convinteger_*')
backend_test.exclude('test_matmulinteger_*')
backend_test.exclude('test_reduce_log_sum_exp*') # dependent on actual float64 implementation for backends
backend_test.exclude('test_operator_add*') # dependent on float64 math. Without it values default to 0 or inf
# we don't support indexes
# backend_test.exclude('test_argmax_*') # Needs more work: select_last_index
# backend_test.exclude('test_argmin_*') # Needs more work: select_last_index
backend_test.exclude('test_nonzero_*')
# no support for mod
backend_test.exclude('test_mod_*')
# no boolean ops (2d, 3d, 4d)
backend_test.exclude('test_bitshift_*')
# no scatternd gathernd
backend_test.exclude('test_gathernd_*')
backend_test.exclude('test_scatternd_*')
# no quantize
backend_test.exclude('test_dynamicquantizelinear_*')
backend_test.exclude('test_qlinearmatmul_*')
backend_test.exclude('test_qlinearconv_*')
backend_test.exclude('test_quantizelinear_*')
# no rnn
backend_test.exclude('test_gru_*')
backend_test.exclude('test_rnn_*')
backend_test.exclude('test_lstm_*')
backend_test.exclude('test_simple_rnn_*')
# no control flow
backend_test.exclude('test_if_*')
backend_test.exclude('test_loop*')
backend_test.exclude('test_range_float_type_positive_delta_expanded_cpu') # requires loop
# unsupported (strange) ops
backend_test.exclude('test_bitwise_*')
backend_test.exclude('test_blackmanwindow_*')
backend_test.exclude('test_bernoulli_*')
backend_test.exclude('test_cumsum_*')
backend_test.exclude('test_det_*')
backend_test.exclude('test_tril_zero_cpu') # TODO: zero array support
backend_test.exclude('test_triu_zero_cpu') # TODO: zero array support
backend_test.exclude('test_col2im_*')
backend_test.exclude('test_hammingwindow_*')
backend_test.exclude('test_hannwindow_*')
backend_test.exclude('test_hardmax_*')
backend_test.exclude('test_gridsample_*')
backend_test.exclude('test_dft_*')
backend_test.exclude('test_einsum_*')
backend_test.exclude('test_strnorm_*')
backend_test.exclude('test_unique_*')
backend_test.exclude('test_sequence_*')
backend_test.exclude('test_nonmaxsuppression_*')
backend_test.exclude('test_reversesequence_*')
backend_test.exclude('test_roialign_*')
backend_test.exclude('test_top_k_*')
backend_test.exclude('test_tfidfvectorizer_*')
backend_test.exclude('test_stft_*')
backend_test.exclude('test_melweightmatrix_*')
# more strange ops
backend_test.exclude('test_basic_deform_conv_*')
backend_test.exclude('test_deform_conv_*')
backend_test.exclude('test_lppool_*')
backend_test.exclude('test_depthtospace_*')
backend_test.exclude('test_spacetodepth_*')
backend_test.exclude('test_scan*')
backend_test.exclude('test_split_to_sequence_*')
backend_test.exclude('test_resize_downsample_scales_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_downsample_sizes_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_scales_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_sizes_cubic_*') # unsure how to implement cubic
# rest of the failing tests
backend_test.exclude('test_averagepool_2d_dilations_cpu') # dilations != 1 not supported for avgpool
backend_test.exclude('test_convtranspose_autopad_same_cpu') # TODO geohotstan has no idea how this is done, autopad requires output_shape but output_shape requires pads from autopad
backend_test.exclude('test_optional_has_element_empty_optional_input_cpu') # Attempts to create Tensor from None
backend_test.exclude('test_range_int32_type_negative_delta_expanded_cpu') # AttributeProto.GRAPH not implemented
backend_test.exclude('test_reshape_allowzero_reordered_cpu') # reshaping to 0 shape
backend_test.exclude('test_resize_downsample_scales_linear_antialias_cpu') # antialias not implemented
backend_test.exclude('test_resize_downsample_sizes_linear_antialias_cpu') # antialias not implemented
backend_test.exclude('test_resize_tf_crop_and_resize_cpu') # unsure about fill value after clip
backend_test.exclude('test_operator_addconstant_cpu') # bad data type
# issue 1556 https://github.com/tinygrad/tinygrad/issues/1556
backend_test.exclude('test_isinf_cpu')
backend_test.exclude('test_isinf_negative_cpu')
backend_test.exclude('test_isinf_positive_cpu')
backend_test.exclude('test_isnan_cpu')
# issue 1791 fast math messes with these https://github.com/tinygrad/tinygrad/issues/1791
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_2_3_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_3_2_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_cpu')
# issue 2067 potentially also a fastmath issue https://github.com/tinygrad/tinygrad/issues/2067
if getenv('METAL'):
  backend_test.exclude('test_maxpool_2d_pads_cpu')
  backend_test.exclude('test_maxpool_2d_same_lower_cpu')

# Don't know how to treat special TensorProto like TensorProto.FLOAT8E4M3FN
if getenv("CPU") or getenv("TORCH"):
  backend_test.exclude('test_dequantizelinear_axis_cpu')
  backend_test.exclude('test_dequantizelinear_cpu')

# compiled backends cannot reshape to and from 0
if getenv('LLVM') or getenv('GPU') or getenv('CLANG') or getenv('METAL') or getenv('CUDA'):
  backend_test.exclude('test_slice_start_out_of_bounds_cpu')
  backend_test.exclude('test_constantofshape_int_shape_zero_cpu')

if getenv('GPU') or getenv('METAL'):
  backend_test.exclude('test_mish_cpu')  # weird inaccuracy
  backend_test.exclude('test_mish_expanded_cpu')  # weird inaccuracy
  backend_test.exclude('test_eyelike_with_dtype_cpu')  # backend does not support dtype: Double

# Segfaults in CI
if (getenv('LLVM') or getenv('CUDA')) and CI:
  backend_test.exclude('test_max_float16_cpu')
  backend_test.exclude('test_min_float16_cpu')

# disable model tests for now since they are slow
if not getenv("MODELTESTS"):
  for x in backend_test.test_suite:
    if 'OnnxBackendRealModelTest' in str(type(x)):
      backend_test.exclude(str(x).split(" ")[0])
else:
  # model tests all pass!
  backend_test.include('test_resnet50')
  backend_test.include('test_inception_v1')
  backend_test.include('test_inception_v2')
  backend_test.include('test_densenet121')
  backend_test.include('test_shufflenet')
  backend_test.include('test_squeezenet')
  backend_test.include('test_bvlc_alexnet')
  backend_test.include('test_zfnet512')
  backend_test.include('test_vgg19')

globals().update(backend_test.enable_report().test_cases)

if __name__ == '__main__':
  unittest.main()

View File

@@ -0,0 +1,392 @@
#!/usr/bin/env python
import os
import torch
if "OPT" not in os.environ:
  os.environ["OPT"] = "2"

import gc
import numpy as np
import unittest
from tinygrad.tensor import Tensor, Device
from tinygrad import nn
from tinygrad.helpers import getenv
from tinygrad.nn import optim
from tinygrad.helpers import GlobalCounters
from tinygrad.lazy import PUSH_PERMUTES
from tinygrad.jit import CacheCollector

class CLCache:
  def __init__(self, allowed=None, strict=False, preclear=True): self.allowed, self.strict, self.preclear = allowed, strict, preclear
  def __enter__(self):
    if self.preclear:
      gc.collect()
      for x in [x for x in gc.get_objects() if isinstance(x, Tensor)]:
        x.realize()
      GlobalCounters.reset()
    CacheCollector.start()
    print("cache: entering")
  def __exit__(self, type, value, traceback):
    cache = CacheCollector.finish()
    print(f"cache: exiting with size {len(cache)}", f"allowed {self.allowed}" if self.allowed is not None else "")
    if self.allowed is not None:
      assert len(cache) <= self.allowed and (not self.strict or len(cache) == self.allowed), f"used too many kernels! {len(cache)} > {self.allowed}"

from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.resnet import ResNet18
from models.vit import ViT
from tinygrad.nn.state import get_parameters

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestInferenceMinKernels(unittest.TestCase):
  def setUp(self):
    Tensor.training = False

  @unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
  def test_convnext(self):
    model = ConvNeXt()
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(129):
      model(img).realize()

  def test_enet(self):
    model = EfficientNet(getenv("ENET_NUM", 0), has_se=False)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(51):
      model.forward(img).realize()

  def test_enet_se(self):
    model = EfficientNet(getenv("ENET_NUM", 0), has_se=True)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    # TODO: this seems very high
    with CLCache(115):
      model.forward(img).realize()

  def test_resnet(self):
    model = ResNet18()
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(26):
      model.forward(img).realize()

  def test_vit(self):
    model = ViT(embed_dim=192, num_heads=3)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(222):  # NOTE: this is way too high
      out = model.forward(img)
      assert len(CacheCollector.cache) == 0, "ViT prerealized?"
      out.realize()

  def test_llama(self):
    from examples.llama import Transformer
    args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    with CLCache(85):
      model(Tensor([[1,2,3,4]]), 0).realize()

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptBinOp(unittest.TestCase):
  def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      c = f1(a, b)
      if f2 is not None: d = f2(a, b)
      c.realize()
      if f2 is not None: d.realize()
      assert len(CacheCollector.cache) == allowed, "binop was rerun!"
    if f2 is not None: np.testing.assert_allclose(c.numpy().ravel(), d.numpy().ravel(), rtol=1e-3, atol=1e-5)

  def test_no_binop_rerun(self): return self._test_no_binop_rerun(lambda a,b: a*b, lambda a,b: (a*b).reshape(16, 16, 1))
  def test_no_binop_rerun_alt(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(16, 16, 1), lambda a,b: a*b)
  def test_no_binop_rerun_reduce_broadcast(self): return self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=2)

  @unittest.skip("this test started failing with the new change, based movementop issue")
  def test_no_binop_rerun_transposed(self): return self._test_no_binop_rerun(lambda a,b: (a.T*b.T).T, lambda a,b: a*b)

  def test_no_binop_rerun_mid_reshape(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(256)+a.reshape(256))

  # currently non working tests
  #def test_no_binop_rerun_preshape(self): return self._test_no_binop_rerun(lambda a,b: a.reshape(16, 16, 1)*b.reshape(16, 16, 1), lambda a,b: a*b)
  #def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
  #def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptReduceLoop(unittest.TestCase):
  @unittest.skip("this is broken")
  def test_loop_left(self):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      t = a.sum(0)
      b = t.reshape(16,1).expand(16,16).sum(0)
      c = (t+b)
      c.realize()
      assert len(CacheCollector.cache) == 2, "loop left fusion broken"

  def test_loop_right(self):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      t = a.sum(0)
      b = t.reshape(16,1).expand(16,16).sum(0)
      c = (b+t)
      c.realize()
      assert len(CacheCollector.cache) == 2, "loop right fusion broken"

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptWChild(unittest.TestCase):
  def test_unrealized_child(self):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      c = (a*b).sum()
      d = c+1
      e = c+2
      d.realize()
      assert len(CacheCollector.cache) == 2, "don't fuse if you have children"

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOpt(unittest.TestCase):
  def test_muladd(self):
    a,b,c = [Tensor.ones(2,2) for _ in range(3)]
    with CLCache():
      d = a * b + c
      d.realize()
      assert len(CacheCollector.cache) == 1, "optimizer didn't fold muladd"
    np.testing.assert_allclose(d.numpy(), np.ones((2,2))*2, rtol=1e-5)

  def test_fold_reduce_elementwise(self):
    img = Tensor.ones(32)
    addme = Tensor.ones(1)
    with CLCache():
      ret = img.sum() + addme
      ret.realize()
      assert len(CacheCollector.cache) == 1, "optimizer didn't fold reduce/elementwise"
    assert ret.numpy()[0] == 33

  def test_fold_batchnorm(self):
    with Tensor.train():
      img = Tensor.ones(1,32,4,4)
      bn = nn.BatchNorm2d(32, track_running_stats=False)
      with CLCache():
        img_bn = bn(img).realize()
        print(img_bn)
        assert len(CacheCollector.cache) == 3, f"optimizer didn't fold batchnorm, got {len(CacheCollector.cache)}"
      # Tensor.training = False

  def test_fold_conv_sgd(self):
    with Tensor.train():
      img = Tensor.ones(2,3,4,4)
      c1 = nn.Conv2d(3,32,3)
      opt = optim.SGD(get_parameters(c1))
      with CLCache():
        opt.zero_grad()
        c1(img).relu().sum().backward()
        opt.step()
        # TODO: this should be 4, but the sum output child stays around
        # with pushing_permutes it can be 3
        # TODO: broken with optim fixes
        assert len(CacheCollector.cache) in [4,5,6], f"optimizer didn't fold conv-backward SGD, got {len(CacheCollector.cache)}"
      # Tensor.training = False

  def test_fold_2convs_sgd(self):
    with Tensor.train():
      img = Tensor.ones(2,3,64,64)
      c1 = nn.Conv2d(3,16,3,bias=False)
      c2 = nn.Conv2d(16,32,3,bias=False)
      opt = optim.SGD(get_parameters([c1, c2]))
      with CLCache(allowed=9):
        opt.zero_grad()
        c2(c1(img).relu()).relu().sum().backward()
        opt.step()
      # Tensor.training = False

  def test_fold_4convs_sgd(self):
    with Tensor.train():
      img = Tensor.ones(2,3,64,64)
      c1 = nn.Conv2d(3,4,3,bias=False)
      c2 = nn.Conv2d(4,8,3,bias=False)
      c3 = nn.Conv2d(8,16,3,bias=False)
      c4 = nn.Conv2d(16,32,3,bias=False)
      opt = optim.SGD(get_parameters([c1, c2, c3, c4]))
      with CLCache(allowed=19):
        opt.zero_grad()
        c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
        opt.step()
      # Tensor.training = False

  def test_fold_conv_batchnorm_sgd(self):
    with Tensor.train():
      img = Tensor.ones(1,3,4,4)
      c1 = nn.Conv2d(3,32,3)
      bn = nn.BatchNorm2d(32, track_running_stats=False)
      opt = optim.SGD(get_parameters([c1, bn]))
      with CLCache(allowed=18):  # this is too high
img_bn = bn(c1(img)).elu().sum()
opt.zero_grad()
img_bn.backward()
opt.step()
# Tensor.training = False
def test_fold_conv_batchnorm_notrain(self):
img = Tensor.ones(1,3,8,8)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
# precache the bn
img_conv = bn(c1(img)).relu().realize()
with CLCache():
img_conv = bn(c1(img)).relu().realize()
assert len(CacheCollector.cache) == 1, f"optimizer didn't fold conv-batchnorm at test time, got {len(CacheCollector.cache)}"
def test_fold_conv_batchnorm(self):
with Tensor.train():
img = Tensor.ones(1,3,8,8)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
with CLCache():
img_conv = bn(c1(img)).relu().realize()
print(img_conv)
assert len(CacheCollector.cache) == 4, f"optimizer didn't fold conv-batchnorm, got {len(CacheCollector.cache)}"
def test_fold_conv_elu(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3)
c2 = nn.Conv2d(4, 4, kernel_size=3)
with CLCache():
img_conv = img.sequential([c1, Tensor.elu, c2, Tensor.elu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/elu"
def test_fold_conv_relu(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3)
c2 = nn.Conv2d(4, 4, kernel_size=3)
with CLCache():
img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"
def test_fold_conv_relu_nobias(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
c2 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
with CLCache():
img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"
def test_permute_was_pushed(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = c.permute(1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
def test_permute_was_pushed_through_contract_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache():
c = a.sum(-1)
d = c.reshape(16,16).permute(1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,16).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
def test_permute_was_pushed_through_contractw1s_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache():
c = a.sum(-1)
d = c.reshape(16,1,16).permute(2,1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,1,16).transpose(2,1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
# TODO: push permute through expansion reshape
@unittest.skip("expansion can't push expand permute yet")
@unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
def test_permute_was_pushed_through_expand_reshape(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0).reshape(4,4,4,4), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
@unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
def test_no_reduceop_rerun(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = a.sum(2).permute(1,0)
c.realize()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(c.numpy().transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
assert cache_len == 1, "reduceop was rerun!"
@unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
def test_no_reduceop_rerun_alt(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2).permute(1,0)
d = a.sum(2)
c.realize()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(c.numpy(), d.numpy().transpose(1,0), rtol=1e-3, atol=1e-5)
assert cache_len == 1, "reduceop was rerun!"
def test_fold_with_contiguous(self):
a = Tensor.randn(16, 16, 16)
b = Tensor.randn(16, 16)
with CLCache():
c = (a.sum(2).contiguous() + b).contiguous()
c.realize()
cache_len = len(CacheCollector.cache)
assert cache_len == 1, "contiguous wasn't folded"
  def _test_fold_expand_reduce_helper(self, n, m, axis, allowed):
    b = torch.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
    with CLCache(allowed=allowed):
      a = Tensor.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
      a.realize()
    np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
  def test_expand_reduce_is_folded_on_same_axis(self):
    for axis in [0, 1]:
      for n in [4, 8, 16]:
        self._test_fold_expand_reduce_helper(n, n, axis, allowed=2)
  def test_expand_reduce_is_not_folded_on_different_axes(self):
    axis1, axis2 = 0, 1
    for n in [4, 8, 16]:
      b = torch.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
      with CLCache(allowed=3):
        a = Tensor.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
        a.realize()
      np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python
import unittest
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import LAMB
np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
W_init = np.random.randn(4,4).astype(np.float32)
m_init = np.random.randn(1,4).astype(np.float32)
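# a tiny two-layer net is trained with both tinygrad's LAMB and tensorflow-addons' LAMB;
# the resulting x and W must match within tolerance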
class TinyNet:
def __init__(self):
self.x = Tensor(x_init.copy(), requires_grad=True)
self.W = Tensor(W_init.copy(), requires_grad=True)
self.m = Tensor(m_init.copy())
def forward(self):
out = self.x.matmul(self.W).relu()
out = out.log_softmax(1)
out = out.mul(self.m).add(self.m).sum()
return out
class TinyNetTF:
def __init__(self):
self.x = tf.Variable(x_init.copy(), trainable=True)
self.W = tf.Variable(W_init.copy(), trainable=True)
self.m = tf.constant(m_init.copy())
def forward(self):
out = tf.matmul(self.x, self.W)
out = tf.nn.relu(out)
out = tf.nn.log_softmax(out, axis=1)
out = tf.multiply(out, self.m) + self.m
out = tf.reduce_sum(out)
return out
def step(optim, steps=1, kwargs={}):
net = TinyNet()
optim = optim([net.x, net.W], **kwargs)
for _ in range(steps):
out = net.forward()
optim.zero_grad()
out.backward()
optim.step()
return net.x.detach().numpy(), net.W.detach().numpy()
def step_tf(optim, steps=1, kwargs={}):
net = TinyNetTF()
optim = optim(**kwargs)
for _ in range(steps):
with tf.GradientTape() as tape:
out = net.forward()
grads = tape.gradient(out, [net.x, net.W])
optim.apply_gradients(zip(grads, [net.x, net.W]))
return net.x.numpy(), net.W.numpy()
class ExternalTestOptim(unittest.TestCase):
def _test_optim(self, tinygrad_optim, tensorflow_optim, steps, opts, atol, rtol):
for x,y in zip(step(tinygrad_optim, steps, kwargs=opts),
step_tf(tensorflow_optim, steps, kwargs=opts)):
np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)
def _test_lamb(self, steps, opts, atol, rtol): self._test_optim(LAMB, tfa.optimizers.LAMB, steps, opts, atol, rtol)
def test_lamb(self): self._test_lamb(1, {'lr': 0.001}, 1e-5, 0)
def test_lamb_high_lr(self): self._test_lamb(1, {'lr': 10}, 1e-5, 1e-5)
def test_multistep_lamb(self): self._test_lamb(10, {'lr': 0.001}, 1e-5, 0)
def test_multistep_lamb_high_lr(self): self._test_lamb(10, {'lr': 10}, 1e-5, 3e-4)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,57 @@
# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
import unittest, time
import numpy as np
from examples.llama import Transformer, MODEL_PARAMS
from test.test_net_speed import start_profile, stop_profile
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.ops import Compiled
from tinygrad.helpers import dtypes, prod
from tinygrad.runtime.lib import RawBuffer
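# FakeProgram and RawFakeBuffer stub out the compiled backend's program and buffer types,
# so kernels are generated (and cached) but never actually executed on a device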
class FakeProgram:
def __init__(self, name:str, prg:str): pass
def __call__(self, *bufs, global_size, local_size, wait=False): pass
class RawFakeBuffer(RawBuffer):
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
class TestLLaMASpeed(unittest.TestCase):
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
def test_llama_compile(self):
backup_program = Device[Device.DEFAULT].runtime
backup_buffer = Device[Device.DEFAULT].buffer
Device[Device.DEFAULT].runtime = FakeProgram
Device[Device.DEFAULT].buffer = RawFakeBuffer
print("testing llama python run time")
model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
print("built model")
# assign fake tensors to the values
for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
print("assigned empty tensors, doing warmup")
def run_llama(st, empty_method_cache=True):
if empty_method_cache: Device[Device.DEFAULT].method_cache.clear()
tms = [time.perf_counter()]
for i in range(10):
model(Tensor([[2]]), i).realize()
tms.append(time.perf_counter())
timings = [(tms[i+1]-tms[i])*1000 for i in range(len(tms)-1)]
print(f"{st:15s} mean runtime: {sum(timings)/len(timings):7.2f}ms, runs: ", ", ".join(f'{x:7.2f}' for x in timings))
run_llama("codegen")
run_llama("methodcache", False)
pr = start_profile()
run_llama("profile")
stop_profile(pr, sort='time', frac=0.1)
Device[Device.DEFAULT].runtime = backup_program
Device[Device.DEFAULT].buffer = backup_buffer
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python
import unittest
from tinygrad.tensor import Tensor
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.renderer.opencl import OpenCLRenderer
from tinygrad.graph import graph_uops
from tinygrad.nn import Conv2d
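# each test lowers a scheduled AST through the Linearizer with hand-coded optimizations,
# dumps the resulting uop graph, and renders OpenCL source from it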
class TestUopsGraph(unittest.TestCase):
def test_matmul(self):
N = 1024
a = Tensor.rand(N,N)
b = Tensor.rand(N,N)
si = (a@b).lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
print(lin.colored_shape())
uops = lin.linearize().uops
graph_uops(uops)
for u in uops: print(u)
print(OpenCLRenderer("matmul", uops)[0])
def test_reduce(self):
a = Tensor.rand(1024*1024)
si = a.sum().lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
uops = lin.linearize().uops
graph_uops(uops)
#print(OpenCLRenderer("reduce", uops)[0])
def test_conv(self):
x = Tensor.rand(1,3,16,16)
c = Conv2d(3, 16, (3,3))
si = c(x).elu().lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
uops = lin.linearize().uops
graph_uops(uops)
print(lin.colored_shape())
print(OpenCLRenderer("conv", uops)[0])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,36 @@
import io
import unittest
from pathlib import Path
import cv2
import requests # type: ignore
import numpy as np
from tinygrad.tensor import Tensor
from examples.yolov3 import Darknet, infer, show_labels
from extra.utils import fetch
chicken_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/Chicken.jpg'))
car_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/car.jpg'))
class TestYOLO(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = Darknet(fetch("https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"))
print("Loading weights file (237MB). This might take a while…")
cls.model.load_weights("https://pjreddie.com/media/files/yolov3.weights")
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
labels = show_labels(infer(self.model, chicken_img), confidence=0.56)
self.assertEqual(labels, ["bird"])
def test_car(self):
labels = show_labels(infer(self.model, car_img))
self.assertEqual(labels, ["car"])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,76 @@
import numpy as np
from extra.utils import fetch, download_file, get_child
from examples.yolov8 import YOLOv8, get_variant_multiples, preprocess, postprocess, label_predictions
from pathlib import Path
import unittest
import io, cv2, os
import onnxruntime as ort
import ultralytics
from tinygrad.nn.state import safe_load, load_state_dict
class TestYOLOv8(unittest.TestCase):
def test_all_load_weights(self):
for variant in ['n', 's', 'm', 'l', 'x']:
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
download_file(f'https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{variant}.safetensors', weights_location)
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
print(f'successfully loaded weights for yolov{variant}')
def test_predictions(self):
test_image_urls = ['https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg', 'https://www.aljazeera.com/wp-content/uploads/2022/10/2022-04-28T192650Z_1186456067_UP1EI4S1I0P14_RTRMADP_3_SOCCER-ENGLAND-MUN-CHE-REPORT.jpg']
variant = 'n'
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
for i in range(len(test_image_urls)):
img_stream = io.BytesIO(fetch(test_image_urls[i]))
img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1)
test_image = preprocess([img])
predictions = TinyYolov8(test_image)
post_predictions = postprocess(preds=predictions, img=test_image, orig_imgs=[img])
labels = label_predictions(post_predictions)
assert labels == {5: 1, 0: 4, 11: 1} if i == 0 else labels == {0: 13, 29: 1, 32: 1}
def test_forward_pass_torch_onnx(self):
variant = 'n'
weights_location_onnx = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.onnx'
weights_location_pt = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.pt'
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
download_file(f'https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8{variant}.pt', weights_location_pt)
    # the ultralytics export prints a lot of unnecessary things
if not weights_location_onnx.is_file():
model = ultralytics.YOLO(model=weights_location_pt, task='Detect')
model.export(format="onnx",imgsz=[640, 480])
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
image_location = [np.frombuffer(io.BytesIO(fetch('https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg')).read(), np.uint8)]
orig_image = [cv2.imdecode(image_location[0], 1)]
input_image = preprocess(orig_image)
onnx_session = ort.InferenceSession(weights_location_onnx)
onnx_input_name = onnx_session.get_inputs()[0].name
onnx_output_name = onnx_session.get_outputs()[0].name
onnx_output = onnx_session.run([onnx_output_name], {onnx_input_name: input_image.numpy()})
tiny_output = TinyYolov8(input_image)
    # rtol is currently 0.025 because our predictions differ from torch's by 1-2%:
    # the maxpool layers in the SPPF module (line 280) pad with zeros, while torch pads with -infinity.
    # the difference is not noticeable visually.
np.testing.assert_allclose(onnx_output[0], tiny_output.numpy(), atol=5e-4, rtol=0.025)
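    # minimal illustration of the mismatch (not part of the test): with zero padding, a maxpool
    # window hanging over the border of an all-negative feature map returns 0, while torch's
    # -infinity padding returns the window's true (negative) maximum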
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,61 @@
import random
from tinygrad.helpers import DEBUG
from test.unit.test_shapetracker import CheckingShapeTracker
random.seed(42)
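# fuzz the ShapeTracker: apply random movement ops and let CheckingShapeTracker verify the
# symbolic view still matches the concretely tracked data (st.assert_same())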
def do_permute(st):
perm = list(range(0, len(st.shape)))
random.shuffle(perm)
perm = tuple(perm)
if DEBUG >= 1: print("st.permute(", perm, ")")
st.permute(perm)
def do_pad(st):
c = random.randint(0, len(st.shape)-1)
pad = tuple((random.randint(0,2), random.randint(0,2)) if i==c else (0,0) for i in range(len(st.shape)))
if DEBUG >= 1: print("st.pad(", pad, ")")
st.pad(pad)
def do_reshape_split_one(st):
c = random.randint(0, len(st.shape)-1)
poss = [n for n in [1,2,3,4,5] if st.shape[c]%n == 0]
spl = random.choice(poss)
shp = st.shape[0:c] + (st.shape[c]//spl, spl) + st.shape[c+1:]
if DEBUG >= 1: print("st.reshape(", shp, ")")
st.reshape(shp)
def do_reshape_combine_two(st):
if len(st.shape) < 2: return
c = random.randint(0, len(st.shape)-2)
shp = st.shape[:c] + (st.shape[c] * st.shape[c+1], ) + st.shape[c+2:]
if DEBUG >= 1: print("st.reshape(", shp, ")")
st.reshape(shp)
def do_shrink(st):
c = random.randint(0, len(st.shape)-1)
while 1:
shrink = tuple((random.randint(0,s), random.randint(0,s)) if i == c else (0,s) for i,s in enumerate(st.shape))
if all(x<y for (x,y) in shrink): break
if DEBUG >= 1: print("st.shrink(", shrink, ")")
st.shrink(shrink)
def do_stride(st):
c = random.randint(0, len(st.shape)-1)
stride = tuple(random.choice([-2,-1,2]) if i==c else 1 for i in range(len(st.shape)))
if DEBUG >= 1: print("st.stride(", stride, ")")
st.stride(stride)
def do_expand(st):
c = [i for i,s in enumerate(st.shape) if s==1]
if len(c) == 0: return
c = random.choice(c)
expand = tuple(random.choice([2,3,4]) if i==c else s for i,s in enumerate(st.shape))
if DEBUG >= 1: print("st.expand(", expand, ")")
st.expand(expand)
if __name__ == "__main__":
ops = [do_permute, do_pad, do_shrink, do_reshape_split_one, do_reshape_combine_two, do_stride, do_expand]
for _ in range(200):
st = CheckingShapeTracker((random.randint(2, 10), random.randint(2, 10), random.randint(2, 10)))
for i in range(8): random.choice(ops)(st)
st.assert_same()

View File

@@ -0,0 +1,69 @@
import itertools
import random
from tinygrad.helpers import DEBUG
from tinygrad.shape.symbolic import Variable
random.seed(42)
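# build a random tape of symbolic ops over three Variables, then check that evaluating the
# rendered expression matches replaying the same tape on concrete integer samples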
def add_v(expr, rng=None):
if rng is None: rng = random.randint(0,2)
return expr + v[rng], rng
def div(expr, rng=None):
if rng is None: rng = random.randint(1,9)
return expr // rng, rng
def mul(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr * rng, rng
def mod(expr, rng=None):
if rng is None: rng = random.randint(1,9)
return expr % rng, rng
def add_num(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr + rng, rng
def lt(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr < rng, rng
def ge(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr >= rng, rng
def le(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr <= rng, rng
def gt(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr > rng, rng
if __name__ == "__main__":
ops = [add_v, div, mul, add_num, mod]
for _ in range(1000):
upper_bounds = [*list(range(1, 10)), 16, 32, 64, 128, 256]
u1 = Variable("v1", 0, random.choice(upper_bounds))
u2 = Variable("v2", 0, random.choice(upper_bounds))
u3 = Variable("v3", 0, random.choice(upper_bounds))
v = [u1,u2,u3]
tape = [random.choice(ops) for _ in range(random.randint(2, 30))]
# 10% of the time, add one of lt, le, gt, ge
if random.random() < 0.1: tape.append(random.choice([lt, le, gt, ge]))
expr = Variable.num(0)
rngs = []
for t in tape:
expr, rng = t(expr)
if DEBUG >= 1: print(t.__name__, rng)
rngs.append(rng)
if DEBUG >=1: print(expr)
space = list(itertools.product(range(u1.min, u1.max+1), range(u2.min, u2.max+1), range(u3.min, u3.max+1)))
volume = len(space)
for (v1, v2, v3) in random.sample(space, min(100, volume)):
v = [v1,v2,v3]
rn = 0
for t,r in zip(tape, rngs): rn, _ = t(rn, r)
num = eval(expr.render())
assert num == rn, f"mismatched {expr.render()} at {v1=} {v2=} {v3=} = {num} != {rn}"
if DEBUG >= 1: print(f"matched {expr.render()} at {v1=} {v2=} {v3=} = {num} == {rn}")

View File

@@ -0,0 +1,61 @@
import unittest
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d, BatchNorm2d, optim
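# smoke tests: each model must survive a single SGD training step (forward, backward, step)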
def model_step(lm):
with Tensor.train():
x = Tensor.ones(8,12,128,256, requires_grad=False)
optimizer = optim.SGD(get_parameters(lm), lr=0.001)
loss = lm.forward(x).sum()
optimizer.zero_grad()
loss.backward()
del x,loss
optimizer.step()
class TestBatchnorm(unittest.TestCase):
def test_conv(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
def forward(self, x):
return self.c(x).relu()
lm = LilModel()
model_step(lm)
def test_two_conv(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
self.c2 = Conv2d(32, 32, 3, padding=1, bias=False)
def forward(self, x):
return self.c2(self.c(x)).relu()
lm = LilModel()
model_step(lm)
def test_two_conv_bn(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 24, 3, padding=1, bias=False)
self.bn = BatchNorm2d(24, track_running_stats=False)
self.c2 = Conv2d(24, 32, 3, padding=1, bias=False)
self.bn2 = BatchNorm2d(32, track_running_stats=False)
def forward(self, x):
x = self.bn(self.c(x)).relu()
return self.bn2(self.c2(x)).relu()
lm = LilModel()
model_step(lm)
def test_conv_bn(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
self.bn = BatchNorm2d(32, track_running_stats=False)
def forward(self, x):
return self.bn(self.c(x)).relu()
lm = LilModel()
model_step(lm)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,74 @@
import unittest
import numpy as np
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, CI
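# multidevice_test reruns the wrapped test on every registered backend as a subTest,
# skipping DISK/SHM/FAKE, anything in EXCLUDE_DEVICES, and devices that fail to initialize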
def multidevice_test(fxn):
exclude_devices = getenv("EXCLUDE_DEVICES", "").split(",")
def ret(self):
for device in Device._buffers:
if device in ["DISK", "SHM", "FAKE"]: continue
if not CI: print(device)
if device in exclude_devices:
if not CI: print(f"WARNING: {device} test is excluded")
continue
with self.subTest(device=device):
try:
Device[device]
except Exception:
if not CI: print(f"WARNING: {device} test isn't running")
continue
fxn(self, device)
return ret
class TestExample(unittest.TestCase):
@multidevice_test
def test_convert_to_cpu(self, device):
a = Tensor([[1,2],[3,4]], device=device)
assert a.numpy().shape == (2,2)
b = a.cpu()
assert b.numpy().shape == (2,2)
@multidevice_test
def test_2_plus_3(self, device):
a = Tensor([2], device=device)
b = Tensor([3], device=device)
result = a + b
print(f"{a.numpy()} + {b.numpy()} = {result.numpy()}")
assert result.numpy()[0] == 5.
@multidevice_test
def test_example_readme(self, device):
x = Tensor.eye(3, device=device, requires_grad=True)
y = Tensor([[2.0,0,-2.0]], device=device, requires_grad=True)
z = y.matmul(x).sum()
z.backward()
x.grad.numpy() # dz/dx
y.grad.numpy() # dz/dy
assert x.grad.device == device
assert y.grad.device == device
@multidevice_test
def test_example_matmul(self, device):
try:
Device[device]
except Exception:
print(f"WARNING: {device} test isn't running")
return
x = Tensor.eye(64, device=device, requires_grad=True)
y = Tensor.eye(64, device=device, requires_grad=True)
z = y.matmul(x).sum()
z.backward()
x.grad.numpy() # dz/dx
y.grad.numpy() # dz/dy
assert x.grad.device == device
assert y.grad.device == device
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,50 @@
import unittest
from extra.export_model import export_model, EXPORT_SUPPORTED_DEVICE
from tinygrad.tensor import Tensor, Device
import json
class MockMultiInputModel:
def forward(self, x1, x2, x3):
return x1 + x2 + x3
class MockMultiOutputModel:
def __call__(self, x1):
return x1 + 2.0, x1.pad(((0, 0), (0, 1))) + 1.0
# TODO: move compile_efficientnet tests here
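# export_model returns the exported program plus maps of input and output sizes (the fourth
# return value is unused here); for these tests the program parses as JSON with
# buffers/inputs/outputs metadata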
@unittest.skipUnless(Device.DEFAULT in EXPORT_SUPPORTED_DEVICE, f"Model export is not supported on {Device.DEFAULT}")
class TestModelExport(unittest.TestCase):
def test_multi_input_model_export(self):
model = MockMultiInputModel()
inputs = [Tensor.rand(2,2), Tensor.rand(2,2), Tensor.rand(2,2)]
prg, inp_sizes, _, _ = export_model(model, "", *inputs)
prg = json.loads(prg)
assert len(inputs) == len(prg["inputs"]) == len(inp_sizes), f"Model and exported inputs don't match: mdl={len(inputs)}, prg={len(prg['inputs'])}, inp_sizes={len(inp_sizes)}"
for i in range(len(inputs)):
assert f"input{i}" in inp_sizes, f"input{i} not captured in inp_sizes"
assert f"input{i}" in prg["buffers"], f"input{i} not captured in exported buffers"
for i, exported_input in enumerate(prg["inputs"]):
assert inputs[i].dtype.name == exported_input["dtype"], f"Model and exported input dtype don't match: mdl={inputs[i].dtype.name}, prg={exported_input['dtype']}"
def test_multi_output_model_export(self):
model = MockMultiOutputModel()
    inp = Tensor.rand(2,2)
    outputs = model(inp)
    prg, _, out_sizes, _ = export_model(model, "", inp)
prg = json.loads(prg)
assert len(outputs) == len(prg["outputs"]) == len(out_sizes), f"Model and exported outputs don't match: mdl={len(outputs)}, prg={len(prg['outputs'])}, inp_sizes={len(out_sizes)}"
for i in range(len(outputs)):
assert f"output{i}" in out_sizes, f"output{i} not captured in out_sizes"
assert f"output{i}" in prg["buffers"], f"output{i} not captured in exported buffers"
for i, exported_output in enumerate(prg["outputs"]):
assert outputs[i].dtype.name == exported_output["dtype"], f"Model and exported output dtype don't match: mdl={outputs[i].dtype.name}, prg={exported_output['dtype']}"
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python
import os, cloudpickle, tempfile, unittest, subprocess
from extra.helpers import enable_early_exec, cross_process, _CloudpickleFunctionWrapper
def normalize_line_endings(s): return s.replace(b'\r\n', b'\n')
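# enable_early_exec returns a callable that runs (argv, stdin) command pairs in a helper
# process; cross_process streams a generator's values back from a child process, using
# _CloudpickleFunctionWrapper to make the function picklable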
class TestEarlyExec(unittest.TestCase):
def setUp(self) -> None:
self.early_exec = enable_early_exec()
def early_exec_py_file(self, file_content, exec_args):
with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp:
temp.write(file_content)
temp_path = temp.name
try:
output = self.early_exec((["python3", temp_path] + exec_args, None))
return output
finally:
os.remove(temp_path)
def test_enable_early_exec(self):
output = self.early_exec_py_file(b'print("Hello, world!")', [])
self.assertEqual(b"Hello, world!\n", normalize_line_endings(output))
def test_enable_early_exec_with_arg(self):
output = self.early_exec_py_file(b'import sys\nprint("Hello, " + sys.argv[1] + "!")', ["world"])
self.assertEqual(b"Hello, world!\n", normalize_line_endings(output))
def test_enable_early_exec_process_exception(self):
with self.assertRaises(subprocess.CalledProcessError):
self.early_exec_py_file(b'raise Exception("Test exception")', [])
def test_enable_early_exec_type_exception(self):
with self.assertRaises(TypeError):
self.early_exec((["python3"], "print('Hello, world!')"))
class TestCrossProcess(unittest.TestCase):
def test_cross_process(self):
def _iterate():
for i in range(10): yield i
results = list(cross_process(_iterate))
self.assertEqual(list(range(10)), results)
def test_cross_process_exception(self):
def _iterate():
for i in range(10):
if i == 5: raise ValueError("Test exception")
yield i
with self.assertRaises(ValueError): list(cross_process(_iterate))
def test_CloudpickleFunctionWrapper(self):
def add(x, y): return x + y
self.assertEqual(7, cloudpickle.loads(cloudpickle.dumps(_CloudpickleFunctionWrapper(add)))(3, 4))
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,107 @@
import numpy as np
import torch
import unittest
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_parameters
from tinygrad.nn.optim import Adam
from extra.lr_scheduler import MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR, OneCycleLR
from extra.training import train, evaluate
from extra.datasets import fetch_mnist
import pytest
pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu]
np.random.seed(1337)
Tensor.manual_seed(1337)
X_train, Y_train, X_test, Y_test = fetch_mnist()
class TinyBobNet:
def __init__(self):
self.l1 = Tensor.scaled_uniform(784, 128)
self.l2 = Tensor.scaled_uniform(128, 10)
def parameters(self):
return get_parameters(self)
def forward(self, x):
return x.dot(self.l1).relu().dot(self.l2).log_softmax()
def lr_scheduler_training(sched_fn=None, args=None):
model = TinyBobNet()
optim = Adam(model.parameters(), lr=0.01)
if sched_fn is not None: sched = sched_fn(optim, **args)
for _ in range(25):
train(model, X_train, Y_train, optim, 100)
if sched_fn is not None:
if isinstance(sched, ReduceLROnPlateau):
sched.step(evaluate(model, X_test, Y_test))
else:
sched.step()
return evaluate(model, X_test, Y_test)
def current_lr(optim): return optim.param_groups[0]['lr'] if hasattr(optim, 'param_groups') else optim.lr
def get_lrs(optim, sched, epochs, steps=1, accs=None):
lr = current_lr(optim)
if not isinstance(lr, float): lr = lr.numpy()[0]
lrs = [lr]
for e in range(epochs):
for _ in range(steps):
optim.step()
sched.step() if accs is None else sched.step(accs[e])
lr = current_lr(optim)
if not isinstance(lr, float): lr = lr.numpy()[0]
lrs.append(lr)
return lrs
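# get_lrs records the learning rate after every scheduler step; the traces from the tinygrad
# and torch schedulers are then compared elementwise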
class TestLrScheduler(unittest.TestCase):
def _test_lr_scheduler(self, tinygrad_sched, torch_sched, epochs, opts, atol, rtol):
accs = opts.pop('accs', None)
tinygrad_optim, torch_optim = Adam([], lr=0.01), torch.optim.Adam([torch.tensor([0.], requires_grad=True)], lr=0.01)
tinygrad_sched, torch_sched = tinygrad_sched(tinygrad_optim, **opts), torch_sched(torch_optim, **opts)
tinygrad_lrs = get_lrs(tinygrad_optim, tinygrad_sched, epochs, accs=accs)
torch_lrs = get_lrs(torch_optim, torch_sched, epochs, accs=accs)
np.testing.assert_allclose(tinygrad_lrs, torch_lrs, atol=atol, rtol=rtol)
def _test_multisteplr(self, epochs, opts, atol, rtol):
self._test_lr_scheduler(MultiStepLR, torch.optim.lr_scheduler.MultiStepLR, epochs, opts, atol, rtol)
def _test_reducelronplateau(self, epochs, opts, atol, rtol):
opts['accs'] = np.random.randn(epochs)
self._test_lr_scheduler(ReduceLROnPlateau, torch.optim.lr_scheduler.ReduceLROnPlateau, epochs, opts, atol, rtol)
def _test_cosineannealinglr(self, epochs, opts, atol, rtol):
opts['T_max'] = epochs
self._test_lr_scheduler(CosineAnnealingLR, torch.optim.lr_scheduler.CosineAnnealingLR, epochs, opts, atol, rtol)
def _test_onecyclelr(self, epochs, opts, atol, rtol):
opts['total_steps'] = epochs
self._test_lr_scheduler(OneCycleLR, torch.optim.lr_scheduler.OneCycleLR, epochs, opts, atol, rtol)
def test_multisteplr(self): self._test_multisteplr(10, {'milestones': [1, 2, 7]}, 1e-6, 1e-6)
def test_multisteplr_gamma(self): self._test_multisteplr(10, {'milestones': [1, 2, 7], 'gamma': 0.1337}, 1e-6, 1e-6)
def test_reducelronplateau(self): self._test_reducelronplateau(100, {}, 1e-6, 1e-6)
def test_reducelronplateau_max(self): self._test_reducelronplateau(100, {'mode': 'max'}, 1e-6, 1e-6)
def test_reducelronplateau_factor(self): self._test_reducelronplateau(100, {'factor': 0.1337}, 1e-6, 1e-6)
def test_reducelronplateau_patience(self): self._test_reducelronplateau(100, {'patience': 3}, 1e-6, 1e-6)
def test_reducelronplateau_threshold(self): self._test_reducelronplateau(100, {'threshold': 1e-6}, 1e-6, 1e-6)
def test_reducelronplateau_threshold_mode(self): self._test_reducelronplateau(100, {'threshold_mode': 'abs'}, 1e-6, 1e-6)
def test_cosineannealinglr(self): self._test_cosineannealinglr(100, {}, 1e-6, 1e-6)
def test_cosineannealinglr_eta_min(self): self._test_cosineannealinglr(100, {'eta_min': 0.001}, 1e-6, 1e-6)
def test_onecyclelr(self): self._test_onecyclelr(1000, {'pct_start': 0.3, 'anneal_strategy': 'linear',
'cycle_momentum': False, 'div_factor': 25.0,
'final_div_factor': 10000.0, 'max_lr':1e-5}, 1e-6, 1e-6)
@unittest.skip("slow")
def test_training(self):
without = lr_scheduler_training()
sched_fns = [MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR, OneCycleLR]
argss = [{'milestones': [5, 7, 10, 15], 'gamma': 0.5}, {'factor': 0.5, 'patience': 2}, {'T_max': 25, 'eta_min': 0.001},
{'pct_start': 0.3, 'anneal_strategy': 'linear', 'cycle_momentum': False, 'div_factor': 25.0, 'final_div_factor': 10000.0, 'max_lr':1e-5, 'total_steps': 25}]
for sched_fn, args in zip(sched_fns, argss):
with_sched = lr_scheduler_training(sched_fn, args)
assert with_sched > without
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python
import io, unittest
import os
import tempfile
from unittest.mock import patch, MagicMock
import torch
import numpy as np
from tinygrad.helpers import CI
from extra.utils import fetch, temp, download_file
from tinygrad.nn.state import torch_load
from PIL import Image
@unittest.skipIf(CI, "no internet tests in CI")
class TestFetch(unittest.TestCase):
def test_fetch_bad_http(self):
self.assertRaises(AssertionError, fetch, 'http://httpstat.us/500')
self.assertRaises(AssertionError, fetch, 'http://httpstat.us/404')
self.assertRaises(AssertionError, fetch, 'http://httpstat.us/400')
def test_fetch_small(self):
    assert len(fetch('https://google.com')) > 0
def test_fetch_img(self):
img = fetch("https://media.istockphoto.com/photos/hen-picture-id831791190")
pimg = Image.open(io.BytesIO(img))
assert pimg.size == (705, 1024)
class TestFetchRelative(unittest.TestCase):
def setUp(self):
self.working_dir = os.getcwd()
self.tempdir = tempfile.TemporaryDirectory()
os.chdir(self.tempdir.name)
with open('test_file.txt', 'x') as f:
f.write("12345")
def tearDown(self):
os.chdir(self.working_dir)
self.tempdir.cleanup()
#test ./
def test_fetch_relative_dotslash(self):
self.assertEqual(b'12345', fetch("./test_file.txt"))
#test ../
def test_fetch_relative_dotdotslash(self):
os.mkdir('test_file_path')
os.chdir('test_file_path')
self.assertEqual(b'12345', fetch("../test_file.txt"))
class TestDownloadFile(unittest.TestCase):
def setUp(self):
from pathlib import Path
self.test_file = Path(temp("test_download_file/test_file.txt"))
def tearDown(self):
os.remove(self.test_file)
os.removedirs(self.test_file.parent)
@patch('requests.get')
def test_download_file_with_mkdir(self, mock_requests):
mock_response = MagicMock()
mock_response.iter_content.return_value = [b'1234', b'5678']
mock_response.status_code = 200
mock_response.headers = {'content-length': '8'}
mock_requests.return_value = mock_response
self.assertFalse(self.test_file.parent.exists())
download_file("https://www.mock.com/fake.txt", self.test_file, skip_if_exists=False)
self.assertTrue(self.test_file.parent.exists())
self.assertTrue(self.test_file.is_file())
self.assertEqual('12345678', self.test_file.read_text())
class TestUtils(unittest.TestCase):
def test_fake_torch_load_zipped(self): self._test_fake_torch_load_zipped()
def test_fake_torch_load_zipped_float16(self): self._test_fake_torch_load_zipped(isfloat16=True)
def _test_fake_torch_load_zipped(self, isfloat16=False):
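    # param1 and param2 are overlapping strided views into the same 16-element storage;
    # torch_load must honor their storage_offset when rebuilding the tensors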
class LayerWithOffset(torch.nn.Module):
def __init__(self):
super(LayerWithOffset, self).__init__()
d = torch.randn(16)
self.param1 = torch.nn.Parameter(
d.as_strided([2, 2], [1, 2], storage_offset=5)
)
self.param2 = torch.nn.Parameter(
d.as_strided([2, 2], [1, 2], storage_offset=4)
)
model = torch.nn.Sequential(
torch.nn.Linear(4, 8),
torch.nn.Linear(8, 3),
LayerWithOffset()
)
if isfloat16: model = model.half()
path = temp(f"test_load_{isfloat16}.pt")
torch.save(model.state_dict(), path)
model2 = torch_load(path)
for name, a in model.state_dict().items():
b = model2[name]
a, b = a.numpy(), b.numpy()
assert a.shape == b.shape
assert a.dtype == b.dtype
assert np.array_equal(a, b)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,15 @@
from tinygrad.ops import LazyOp, LoadOps
from tinygrad.nn.state import get_parameters
# for speed
def derandomize(x):
if isinstance(x, LazyOp):
new_op = LoadOps.EMPTY if x.op == LoadOps.RAND else x.op
return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), x.arg)
x.op = derandomize(x.op)
return x
def derandomize_model(model):
for p in get_parameters(model):
p.lazydata = derandomize(p.lazydata)
p.realize()
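# example usage (hypothetical model; any module whose weights come from get_parameters works):
#   model = ResNet18()
#   derandomize_model(model)  # RAND loads become EMPTY, so realizing weights skips RNG kernels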

Binary file not shown.

Binary file not shown.
File diff suppressed because it is too large

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
import torch
def get_question_samp(bsz, seq_len, vocab_size, seed):
np.random.seed(seed)
  in_ids = np.random.randint(vocab_size, size=(bsz, seq_len))
mask = np.random.choice([True, False], size=(bsz, seq_len))
seg_ids = np.random.randint(1, size=(bsz, seq_len))
return in_ids, mask, seg_ids
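# copy tinygrad's state dict into the torch model so both nets start from identical weights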
def set_equal_weights(mdl, torch_mdl):
from tinygrad.nn.state import get_state_dict
state, torch_state = get_state_dict(mdl), torch_mdl.state_dict()
assert len(state) == len(torch_state)
for k, v in state.items():
assert k in torch_state
torch_state[k].copy_(torch.from_numpy(v.numpy()))
torch_mdl.eval()
class TestBert(unittest.TestCase):
def test_questions(self):
from models.bert import BertForQuestionAnswering
from transformers import BertForQuestionAnswering as TorchBertForQuestionAnswering
from transformers import BertConfig
# small
config = {
'vocab_size':24, 'hidden_size':2, 'num_hidden_layers':2, 'num_attention_heads':2,
'intermediate_size':32, 'hidden_dropout_prob':0.1, 'attention_probs_dropout_prob':0.1,
'max_position_embeddings':512, 'type_vocab_size':2
}
# Create in tinygrad
Tensor.manual_seed(1337)
mdl = BertForQuestionAnswering(**config)
# Create in torch
with torch.no_grad():
torch_mdl = TorchBertForQuestionAnswering(BertConfig(**config))
set_equal_weights(mdl, torch_mdl)
seeds = (1337, 3141)
bsz, seq_len = 1, 16
for _, seed in enumerate(seeds):
in_ids, mask, seg_ids = get_question_samp(bsz, seq_len, config['vocab_size'], seed)
out = mdl(Tensor(in_ids), Tensor(mask), Tensor(seg_ids))
torch_out = torch_mdl.forward(torch.from_numpy(in_ids).long(), torch.from_numpy(mask), torch.from_numpy(seg_ids).long())[:2]
torch_out = torch.cat(torch_out).unsqueeze(2)
np.testing.assert_allclose(out.numpy(), torch_out.detach().numpy(), atol=5e-4, rtol=5e-4)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,115 @@
import ast
import pathlib
import sys
import unittest
import numpy as np
from PIL import Image
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor
from models.efficientnet import EfficientNet
from models.vit import ViT
from models.resnet import ResNet50
def _load_labels():
labels_filename = pathlib.Path(__file__).parent / 'efficientnet/imagenet1000_clsidx_to_labels.txt'
return ast.literal_eval(labels_filename.read_text())
_LABELS = _load_labels()
def preprocess(img, new=False):
# preprocess image
aspect_ratio = img.size[0] / img.size[1]
img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
img = np.array(img)
  y0, x0 = (np.asarray(img.shape)[:2] - 224) // 2
img = img[y0: y0 + 224, x0: x0 + 224]
  # normalize: new=True scales to roughly [-1, 1] (efficientnet-lite style); new=False applies ImageNet mean/std normalization in CHW layout
if new:
img = img.astype(np.float32)
img -= [127.0, 127.0, 127.0]
img /= [128.0, 128.0, 128.0]
img = img[None]
else:
img = np.moveaxis(img, [2, 0, 1], [0, 1, 2])
img = img.astype(np.float32)[:3].reshape(1, 3, 224, 224)
img /= 255.0
img -= np.array([0.485, 0.456, 0.406]).reshape((1, -1, 1, 1))
img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))
return img
def _infer(model: EfficientNet, img, bs=1):
Tensor.training = False
img = preprocess(img)
# run the net
if bs > 1: img = img.repeat(bs, axis=0)
out = model.forward(Tensor(img)).cpu()
return _LABELS[np.argmax(out.numpy()[0])]
chicken_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/Chicken.jpg')
car_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/car.jpg')
class TestEfficientNet(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = EfficientNet(number=getenv("NUM"))
cls.model.load_from_pretrained()
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
label = _infer(self.model, chicken_img)
self.assertEqual(label, "hen")
def test_chicken_bigbatch(self):
label = _infer(self.model, chicken_img, 2)
self.assertEqual(label, "hen")
def test_car(self):
label = _infer(self.model, car_img)
self.assertEqual(label, "sports car, sport car")
class TestViT(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = ViT()
cls.model.load_from_pretrained()
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
label = _infer(self.model, chicken_img)
self.assertEqual(label, "cock")
def test_car(self):
label = _infer(self.model, car_img)
self.assertEqual(label, "racer, race car, racing car")
class TestResNet(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = ResNet50()
cls.model.load_from_pretrained()
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
label = _infer(self.model, chicken_img)
self.assertEqual(label, "hen")
def test_car(self):
label = _infer(self.model, car_img)
self.assertEqual(label, "sports car, sport car")
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,165 @@
import torch
from torch import nn
import unittest
import numpy as np
from tinygrad.nn.state import get_parameters, get_state_dict
from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
from tinygrad.tensor import Tensor
from extra.datasets import fetch_mnist
from tinygrad.helpers import CI
def compare_tiny_torch(model, model_torch, X, Y):
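  # run one identical training step in tinygrad and torch (same weights, same input) and
  # assert that losses, per-parameter gradients, and post-step weights all match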
with Tensor.train():
model_torch.train()
model_state_dict = get_state_dict(model)
for k,v in model_torch.named_parameters():
if not CI: print(f"initting {k} from torch")
model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()
optimizer = optim.SGD(get_parameters(model), lr=0.001)
optimizer_torch = torch.optim.SGD(model_torch.parameters(), lr=0.001)
Xt = torch.Tensor(X.numpy())
np.testing.assert_allclose(X.numpy(), Xt.detach().numpy())
out = model(X)
loss = (out * Y).mean()
if not CI: print(loss.realize().numpy())
out_torch = model_torch(torch.Tensor(X.numpy()))
loss_torch = (out_torch * torch.Tensor(Y.numpy())).mean()
if not CI: print(loss_torch.detach().numpy())
# assert losses match
np.testing.assert_allclose(loss.realize().numpy(), loss_torch.detach().numpy(), atol=1e-4)
# zero and backward
optimizer.zero_grad()
loss.backward()
optimizer_torch.zero_grad()
loss_torch.backward()
for k,v in list(model_torch.named_parameters())[::-1]:
g = model_state_dict[k].grad.numpy()
gt = v.grad.detach().numpy()
if not CI: print("testing grads", k)
np.testing.assert_allclose(g, gt, atol=1e-3, err_msg=f'grad mismatch {k}')
# take the steps
optimizer.step()
optimizer_torch.step()
    # assert weights match after the optimizer step (only approximately, hence the atol)
for k,v in model_torch.named_parameters():
if not CI: print("testing weight", k)
np.testing.assert_allclose(model_state_dict[k].numpy(), v.detach().numpy(), atol=1e-3, err_msg=f'weight mismatch {k}')
def get_mnist_data():
X_train, Y_train, X_test, Y_test = fetch_mnist()
BS = 32
num_classes = 10
X = Tensor(X_test[0:BS].astype(np.float32))
Y = np.zeros((BS, num_classes), np.float32)
Y[range(BS),Y_test[0:BS]] = -1.0*num_classes
return X, Tensor(Y)
class TestEnd2End(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.X, cls.Y = get_mnist_data()
def setUp(self):
torch.manual_seed(123)
def test_linear_mnist(self):
class LinTiny:
def __init__(self, has_batchnorm=False):
self.l1 = Linear(784, 128)
self.l2 = Linear(128, 10)
self.bn1 = BatchNorm2d(128) if has_batchnorm else lambda x: x
def __call__(self, x):
return self.l2(self.l1(x)).relu().log_softmax(-1)
class LinTorch(nn.Module):
def __init__(self, has_batchnorm=False):
super().__init__()
self.l1 = nn.Linear(784, 128)
self.l2 = nn.Linear(128, 10)
def forward(self, x):
return self.l2(self.l1(x)).relu().log_softmax(-1)
compare_tiny_torch(LinTiny(), LinTorch(), self.X, self.Y)
def test_bn_mnist(self):
class LinTiny:
def __init__(self):
self.l1 = Linear(784, 128)
self.l2 = Linear(128, 10)
self.bn1 = BatchNorm2d(128)
def __call__(self, x):
return self.l2(self.bn1(self.l1(x).reshape(x.shape[0], -1, 1, 1)).reshape(x.shape[0], -1).relu()).log_softmax(-1)
class LinTorch(nn.Module):
def __init__(self):
super().__init__()
self.l1 = nn.Linear(784, 128)
self.l2 = nn.Linear(128, 10)
self.bn1 = nn.BatchNorm2d(128)
def forward(self, x):
return self.l2(self.bn1(self.l1(x).reshape(x.shape[0], -1, 1, 1)).reshape(x.shape[0], -1).relu()).log_softmax(-1)
compare_tiny_torch(LinTiny(), LinTorch(), self.X, self.Y)
def test_bn_alone(self):
np.random.seed(1337)
X = Tensor(np.random.randn(32, 10, 1, 1).astype(np.float32))
Y = Tensor(np.random.randn(32, 10, 1, 1).astype(np.float32))
compare_tiny_torch(BatchNorm2d(10), nn.BatchNorm2d(10), X, Y)
def test_bn_linear(self):
BS, K = 2, 1
eps = 0
X = Tensor([1,0]).reshape(BS, K, 1, 1)
Y = Tensor([-1,0]).reshape(BS, K, 1, 1)
class LinTiny:
def __init__(self):
self.l1 = Conv2d(K, K, 1, bias=False)
self.bn1 = BatchNorm2d(K, affine=False, track_running_stats=False, eps=eps)
def __call__(self, x): return self.bn1(self.l1(x))
class LinTorch(nn.Module):
def __init__(self):
super().__init__()
self.l1 = nn.Conv2d(K, K, 1, bias=False)
self.bn1 = nn.BatchNorm2d(K, affine=False, track_running_stats=False, eps=eps)
def forward(self, x): return self.bn1(self.l1(x))
model_torch = LinTorch()
with torch.no_grad():
model_torch.l1.weight[:] = 1.
compare_tiny_torch(LinTiny(), model_torch, X, Y)
def test_conv_mnist(self):
class LinTiny:
def __init__(self, has_batchnorm=False):
self.c1 = Conv2d(1, 8, 3, stride=2)
self.c2 = Conv2d(8, 16, 3, stride=2)
self.l1 = Linear(16*6*6, 10)
if has_batchnorm:
self.bn1, self.bn2 = BatchNorm2d(8), BatchNorm2d(16)
else:
self.bn1, self.bn2 = lambda x: x, lambda x: x
def __call__(self, x):
return self.l1(self.bn2(self.c2(self.bn1(self.c1(x)).relu())).relu().reshape(x.shape[0], -1)).log_softmax(-1)
class LinTorch(nn.Module):
def __init__(self, has_batchnorm=False):
super().__init__()
self.c1 = nn.Conv2d(1, 8, 3, stride=2)
self.c2 = nn.Conv2d(8, 16, 3, stride=2)
self.l1 = nn.Linear(16*6*6, 10)
if has_batchnorm:
self.bn1, self.bn2 = nn.BatchNorm2d(8), nn.BatchNorm2d(16)
else:
self.bn1, self.bn2 = lambda x: x, lambda x: x
def forward(self, x):
return self.l1(self.bn2(self.c2(self.bn1(self.c1(x)).relu())).relu().reshape(x.shape[0], -1)).log_softmax(-1)
for has_batchnorm in [False, True]:
with self.subTest(has_batchnorm=has_batchnorm):
compare_tiny_torch(LinTiny(has_batchnorm), LinTorch(has_batchnorm), self.X.reshape((-1, 1, 28, 28)), self.Y)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import optim, BatchNorm2d
from extra.training import train, evaluate
from extra.datasets import fetch_mnist
import pytest
pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
# load the mnist dataset
X_train, Y_train, X_test, Y_test = fetch_mnist()
# create a model
class TinyBobNet:
def __init__(self):
self.l1 = Tensor.scaled_uniform(784, 128)
self.l2 = Tensor.scaled_uniform(128, 10)
def parameters(self):
return get_parameters(self)
def forward(self, x):
return x.dot(self.l1).relu().dot(self.l2).log_softmax()
# create a model with a conv layer
class TinyConvNet:
def __init__(self, has_batchnorm=False):
# https://keras.io/examples/vision/mnist_convnet/
conv = 3
#inter_chan, out_chan = 32, 64
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.scaled_uniform(inter_chan,1,conv,conv)
self.c2 = Tensor.scaled_uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.scaled_uniform(out_chan*5*5, 10)
if has_batchnorm:
self.bn1 = BatchNorm2d(inter_chan)
self.bn2 = BatchNorm2d(out_chan)
else:
self.bn1, self.bn2 = lambda x: x, lambda x: x
def parameters(self):
return get_parameters(self)
def forward(self, x:Tensor):
x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
x = self.bn1(x.conv2d(self.c1)).relu().max_pool2d()
x = self.bn2(x.conv2d(self.c2)).relu().max_pool2d()
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1).log_softmax()
class TestMNIST(unittest.TestCase):
def test_sgd_onestep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=1)
for p in model.parameters(): p.realize()
def test_sgd_threestep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=3)
def test_sgd_sixstep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=6, noloss=True)
def test_adam_onestep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=1)
for p in model.parameters(): p.realize()
def test_adam_threestep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=3)
def test_conv_onestep(self):
np.random.seed(1337)
model = TinyConvNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=1, noloss=True)
for p in model.parameters(): p.realize()
def test_conv(self):
np.random.seed(1337)
model = TinyConvNet()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, steps=100)
assert evaluate(model, X_test, Y_test) > 0.93 # torch gets 0.9415 sometimes
def test_conv_with_bn(self):
np.random.seed(1337)
model = TinyConvNet(has_batchnorm=True)
optimizer = optim.AdamW(model.parameters(), lr=0.003)
train(model, X_train, Y_train, optimizer, steps=200)
assert evaluate(model, X_test, Y_test) > 0.94
def test_sgd(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, steps=600)
assert evaluate(model, X_test, Y_test) > 0.94 # CPU gets 0.9494 sometimes
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,143 @@
#!/usr/bin/env python
import os
import time
import io
import unittest
import numpy as np
import onnx
from extra.utils import fetch, temp
from extra.onnx import get_run_onnx
from tinygrad.tensor import Tensor
from tinygrad.helpers import CI
import pytest
pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
def run_onnx_torch(onnx_model, inputs):
import torch
from onnx2torch import convert
torch_model = convert(onnx_model).float()
with torch.no_grad():
torch_out = torch_model(*[torch.tensor(x) for x in inputs.values()])
return torch_out
OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx"
np.random.seed(1337)
class TestOnnxModel(unittest.TestCase):
def test_benchmark_openpilot_model(self):
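    # time 7 back-to-back runs, separating graph construction (run_onnx), realize, and the
    # device-to-host copy; outside CI, also profile one run and dump a flamegraph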
dat = fetch(OPENPILOT_MODEL)
onnx_model = onnx.load(io.BytesIO(dat))
run_onnx = get_run_onnx(onnx_model)
def get_inputs():
np_inputs = {
"input_imgs": np.random.randn(*(1, 12, 128, 256)),
"big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
"desire": np.zeros((1, 100, 8)),
"traffic_convention": np.array([[1., 0.]]),
"nav_features": np.zeros((1, 256)),
"features_buffer": np.zeros((1, 99, 128)),
}
inputs = {k:Tensor(v.astype(np.float32), requires_grad=False) for k,v in np_inputs.items()}
return inputs
for _ in range(7):
inputs = get_inputs()
st = time.monotonic()
tinygrad_out = run_onnx(inputs)['outputs']
mt = time.monotonic()
tinygrad_out.realize()
mt2 = time.monotonic()
tinygrad_out = tinygrad_out.numpy()
et = time.monotonic()
if not CI: print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")
if not CI:
import cProfile
import pstats
inputs = get_inputs()
pr = cProfile.Profile(timer=time.perf_counter_ns, timeunit=1e-6)
pr.enable()
tinygrad_out = run_onnx(inputs)['outputs']
tinygrad_out.realize()
tinygrad_out = tinygrad_out.numpy()
if not CI:
pr.disable()
stats = pstats.Stats(pr)
stats.dump_stats(temp("net.prof"))
os.system(f"flameprof {temp('net.prof')} > {temp('prof.svg')}")
ps = stats.sort_stats(pstats.SortKey.TIME)
ps.print_stats(30)
def test_openpilot_model(self):
dat = fetch(OPENPILOT_MODEL)
onnx_model = onnx.load(io.BytesIO(dat))
run_onnx = get_run_onnx(onnx_model)
print("got run_onnx")
inputs = {
"input_imgs": np.random.randn(*(1, 12, 128, 256)),
"big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
"desire": np.zeros((1, 100, 8)),
"traffic_convention": np.array([[1., 0.]]),
"nav_features": np.zeros((1, 256)),
"features_buffer": np.zeros((1, 99, 128)),
}
inputs = {k:v.astype(np.float32) for k,v in inputs.items()}
st = time.monotonic()
print("****** run onnx ******")
tinygrad_out = run_onnx(inputs)['outputs']
mt = time.monotonic()
print("****** realize ******")
tinygrad_out.realize()
mt2 = time.monotonic()
tinygrad_out = tinygrad_out.numpy()
et = time.monotonic()
print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")
Tensor.no_grad = True
torch_out = run_onnx_torch(onnx_model, inputs).numpy()
Tensor.no_grad = False
print(tinygrad_out, torch_out)
np.testing.assert_allclose(torch_out, tinygrad_out, atol=1e-4, rtol=1e-2)
def test_efficientnet(self):
dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx")
input_name, input_new = "images:0", True
self._test_model(dat, input_name, input_new)
def test_shufflenet(self):
dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx")
print(f"shufflenet downloaded : {len(dat)/1e6:.2f} MB")
input_name, input_new = "gpu_0/data_0", False
self._test_model(dat, input_name, input_new)
@unittest.skip("test is very slow")
def test_resnet(self):
# NOTE: many onnx models can't be run right now due to max pool with strides != kernel_size
dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx")
print(f"resnet downloaded : {len(dat)/1e6:.2f} MB")
input_name, input_new = "data", False
self._test_model(dat, input_name, input_new)
def _test_model(self, dat, input_name, input_new, debug=False):
onnx_model = onnx.load(io.BytesIO(dat))
print("onnx loaded")
from test.models.test_efficientnet import chicken_img, car_img, preprocess, _LABELS
run_onnx = get_run_onnx(onnx_model)
def run(img):
inputs = {input_name: preprocess(img, new=input_new)}
tinygrad_out = list(run_onnx(inputs, debug=debug).values())[0].numpy()
return tinygrad_out.argmax()
cls = run(chicken_img)
print(cls, _LABELS[cls])
assert _LABELS[cls] == "hen" or _LABELS[cls] == "cock"
cls = run(car_img)
print(cls, _LABELS[cls])
assert "car" in _LABELS[cls] or _LABELS[cls] == "convertible"
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,100 @@
import unittest, time
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn import optim
from tinygrad.nn.state import get_parameters
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
from tinygrad.ops import Device, GlobalCounters
from tinygrad.helpers import CI, dtypes, getenv, prod
from test.helpers import derandomize_model
from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
from examples.hlb_cifar10 import SpeedyResNet
from examples.llama import Transformer as LLaMaTransformer, MODEL_PARAMS as LLAMA_MODEL_PARAMS
from examples.stable_diffusion import UNetModel
def helper_test(nm, gen, train, max_memory_allowed, max_kernels_allowed, all_jitted=False):
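# run each step 4 times and report the fastest: the first call pays for kernel
# compilation and JIT capture, later calls measure the steady state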
tms = []
for _ in range(4):
GlobalCounters.reset()
GlobalCounters.mem_used = 0
Device[Device.DEFAULT].synchronize()
st = time.perf_counter_ns()
train(*gen())
Device[Device.DEFAULT].synchronize()
tms.append(time.perf_counter_ns() - st)
kernels_used = len(train.jit_cache) if hasattr(train, "jit_cache") else None
print(f"{nm}: used {GlobalCounters.mem_used/1e9:.2f} GB and {kernels_used} kernels in {min(tms)/1e6:.2f} ms")
assert GlobalCounters.mem_used/1e9 < max_memory_allowed, f"{nm} used more than {max_memory_allowed:.2f} GB"
assert not kernels_used or kernels_used <= max_kernels_allowed, f"{nm} used more than {max_kernels_allowed} kernels"
if all_jitted:
assert kernels_used > 0 and kernels_used == GlobalCounters.kernel_count, f"only {kernels_used} out of {GlobalCounters.kernel_count} were jitted"
class TestRealWorld(unittest.TestCase):
def setUp(self):
self.old_type = Tensor.default_type
np.random.seed(2002)
def tearDown(self):
Tensor.default_type = self.old_type
@unittest.skipUnless(not CI, "too big for CI")
def test_stable_diffusion(self):
model = UNetModel()
derandomize_model(model)
@TinyJit
def test(t, t2): return model(t, 801, t2).realize()
helper_test("test_sd", lambda: (Tensor.randn(1, 4, 64, 64),Tensor.randn(1, 77, 768)), test, 18.0, 967)
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["LLVM"], "needs JIT, too long on CI LLVM")
def test_llama(self):
Tensor.default_type = dtypes.float16
args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
model = LLaMaTransformer(**(args_tiny if CI else LLAMA_MODEL_PARAMS["1"]["7B"]["args"]))
derandomize_model(model)
@TinyJit
def test(t): return model(t, 0).realize()
# NOTE: only test one pass, not testing the dynamic shape autoregressive part
helper_test("test_llama", lambda: (Tensor([[1,]]),), test, 0.22 if CI else 13.5, 126 if CI else 486, all_jitted=True)
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM"] or not CI), "needs JIT, too long on CI LLVM")
def test_gpt2(self):
Tensor.default_type = dtypes.float16
args_tiny = {"dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-5, "vocab_size": 1000}
model = GPT2Transformer(**(args_tiny if CI else GPT2_MODEL_PARAMS["gpt2-medium"]))
derandomize_model(model)
@TinyJit
def test(t): return model(t, 0).realize()
helper_test("test_gpt2", lambda: (Tensor([[1,]]),), test, 0.21 if CI else 0.9, 129 if CI else 369, all_jitted=True)
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM", "CLANG"] or not CI), "needs JIT, too long on CI LLVM and CLANG")
def test_train_cifar(self):
# TODO: with default device
#old_default = Device.DEFAULT
#Device.DEFAULT = "FAKE"
#Device['fake'].codegen = Device[old_default].codegen
with Tensor.train():
model = SpeedyResNet(Tensor.ones((12,3,2,2)))
optimizer = optim.SGD(get_parameters(model), lr=0.01, momentum=0.8, nesterov=True, weight_decay=0.15)
BS = 32 if CI else 512
@TinyJit
def train(X):
out = model(X)
loss = out.mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()
helper_test("train_cifar", lambda: (Tensor.randn(BS, 3, 32, 32),), train, (1.0/48)*BS, 154) # it's 154 on metal
# reset device
#Device.DEFAULT = old_default
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from models.rnnt import LSTM
import torch
class TestRNNT(unittest.TestCase):
def test_lstm(self):
BS, SQ, IS, HS, L = 2, 20, 40, 128, 2
# create in torch
with torch.no_grad():
torch_layer = torch.nn.LSTM(IS, HS, L)
# create in tinygrad
layer = LSTM(IS, HS, L, 0.0)
# copy weights
with torch.no_grad():
layer.cells[0].weights_ih.assign(Tensor(torch_layer.weight_ih_l0.numpy()))
layer.cells[0].weights_hh.assign(Tensor(torch_layer.weight_hh_l0.numpy()))
layer.cells[0].bias_ih.assign(Tensor(torch_layer.bias_ih_l0.numpy()))
layer.cells[0].bias_hh.assign(Tensor(torch_layer.bias_hh_l0.numpy()))
layer.cells[1].weights_ih.assign(Tensor(torch_layer.weight_ih_l1.numpy()))
layer.cells[1].weights_hh.assign(Tensor(torch_layer.weight_hh_l1.numpy()))
layer.cells[1].bias_ih.assign(Tensor(torch_layer.bias_ih_l1.numpy()))
layer.cells[1].bias_hh.assign(Tensor(torch_layer.bias_hh_l1.numpy()))
# test initial hidden
for _ in range(3):
x = Tensor.randn(SQ, BS, IS)
z, hc = layer(x, None)
torch_x = torch.tensor(x.numpy())
torch_z, torch_hc = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
# test passing hidden
for _ in range(3):
x = Tensor.randn(SQ, BS, IS)
z, hc = layer(x, hc)
torch_x = torch.tensor(x.numpy())
torch_z, torch_hc = torch_layer(torch_x, torch_hc)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,83 @@
import unittest
import time
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.tensor import Device
from tinygrad.helpers import getenv
from extra.training import train
from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.transformer import Transformer
from models.vit import ViT
from models.resnet import ResNet18
import pytest
pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
BS = getenv("BS", 2)
def train_one_step(model,X,Y):
params = get_parameters(model)
pcount = 0
for p in params:
pcount += np.prod(p.shape)
optimizer = optim.SGD(params, lr=0.001)
print("stepping %r with %.1fM params bs %d" % (type(model), pcount/1e6, BS))
st = time.time()
train(model, X, Y, optimizer, steps=1, BS=BS)
et = time.time()-st
print("done in %.2f ms" % (et*1000.))
def check_gc():
if Device.DEFAULT == "GPU":
from extra.introspection import print_objects
assert print_objects() == 0
class TestTrain(unittest.TestCase):
def test_convnext(self):
model = ConvNeXt(depths=[1], dims=[16])
X = np.zeros((BS,3,224,224), dtype=np.float32)
Y = np.zeros((BS), dtype=np.int32)
train_one_step(model,X,Y)
check_gc()
def test_efficientnet(self):
model = EfficientNet(0)
X = np.zeros((BS,3,224,224), dtype=np.float32)
Y = np.zeros((BS), dtype=np.int32)
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "too many buffers for webgpu")
def test_vit(self):
model = ViT()
X = np.zeros((BS,3,224,224), dtype=np.float32)
Y = np.zeros((BS,), dtype=np.int32)
train_one_step(model,X,Y)
check_gc()
def test_transformer(self):
# this is meant to be GPT-2 small, but the param count is off
# (real GPT-2 uses ff_dim = 768*4, not 768//4)
model = Transformer(syms=10, maxlen=6, layers=12, embed_dim=768, num_heads=12, ff_dim=768//4)
X = np.zeros((BS,6), dtype=np.float32)
Y = np.zeros((BS,6), dtype=np.int32)
train_one_step(model,X,Y)
check_gc()
def test_resnet(self):
X = np.zeros((BS, 3, 224, 224), dtype=np.float32)
Y = np.zeros((BS), dtype=np.int32)
for resnet_v in [ResNet18]:
model = resnet_v()
model.load_from_pretrained()
train_one_step(model, X, Y)
check_gc()
def test_bert(self):
# TODO: write this
pass
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env python
import pathlib
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
class TestVGG7(unittest.TestCase):
def test_vgg7(self):
from examples.vgg7_helpers.waifu2x import Vgg7, image_load
# Create in tinygrad
Tensor.manual_seed(1337)
mdl = Vgg7()
mdl.load_from_pretrained()
# Scale up an image
test_x = image_load(pathlib.Path(__file__).parent / 'waifu2x/input.png')
test_y = image_load(pathlib.Path(__file__).parent / 'waifu2x/output.png')
scaled = mdl.forward_tiled(test_x, 156)
scaled = np.fmax(0, np.fmin(1, scaled))
np.testing.assert_allclose(scaled, test_y, atol=5e-3, rtol=5e-3)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,25 @@
import unittest
import pathlib
from tinygrad.ops import Device
from examples.whisper import init_whisper, transcribe_file
@unittest.skipUnless(Device.DEFAULT == "METAL", "Some non-metal backends spend too long trying to allocate a 20GB array")
class TestWhisper(unittest.TestCase):
@classmethod
def setUpClass(cls):
model, enc = init_whisper("tiny.en")
cls.model = model
cls.enc = enc
@classmethod
def tearDownClass(cls):
del cls.model
del cls.enc
def test_transcribe_file(self):
# Audio generated with the command on MacOS:
# say "Could you please let me out of the box?" --file-format=WAVE --data-format=LEUI8@16000 -o test
# We use the WAVE type because it's easier to decode in CI test environments
filename = str(pathlib.Path(__file__).parent / "whisper/test.wav")
transcription = transcribe_file(self.model, self.enc, filename)
self.assertEqual("<|startoftranscript|><|notimestamps|> Could you please let me out of the box?<|endoftext|>", transcription)

Binary file not shown.


Binary file not shown.


Binary file not shown.

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
import unittest
import numpy as np
from weakref import ref
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
def check_gc():
if Device.DEFAULT == "GPU":
from extra.introspection import print_objects
assert print_objects() == 0
class FakeDeviceBuffer:
def __init__(self, sz, dt, device):
self.id = 1
self.size = sz
self.dtype = dt
self.device = device
def __del__(self):
assert self.id == 0, "_do_free() should have been called before deletion"
class FakeAllocator(LRUAllocator):
def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
def _do_free(self, buf):
buf.id -= 1
assert buf.id == 0, f"free should be called exactly once, but id is {buf.id}"
def __del__(self): # Fake allocator should clear all buffers after each test.
for v in self.cached_buffers.values():
for buf, _ in v: self._free_buffer(buf)
FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
def __init__(self, size, dtype, device='0'):
global FAKE_GLOBAL_ALLOCATOR
super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
def alloc(allocator, size, dtype, **kwargs):
global FAKE_GLOBAL_ALLOCATOR
FAKE_GLOBAL_ALLOCATOR = allocator
buf = FakeBuffer(size, dtype, **kwargs)
assert buf.dtype == dtype and buf.size == size
FAKE_GLOBAL_ALLOCATOR = None
return buf
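# alloc_free_trace allocates a buffer and immediately lets it die, so the LRU allocator
# keeps the underlying FakeDeviceBuffer in its cache; the returned weakref lets tests
# check (via cmp_trace_and_buf) whether a later allocation reused that exact device buffer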
def alloc_free_trace(allocator, size, dtype, **kwargs):
buf = alloc(allocator, size, dtype, **kwargs)
return ref(buf._buf)
def cmp_trace_and_buf(buf, trace_ref): return trace_ref and trace_ref() == buf._buf
class TestAllocators(unittest.TestCase):
def test_lru_allocator_reusage(self):
mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
def test():
lru_allocator = FakeAllocator(2048)
traced_buf = alloc_free_trace(lru_allocator, 16, dtypes.float32)
assert GlobalCounters.mem_cached - mc == 16*dtypes.float32.itemsize, "Buffer should be cached"
for _ in range(32):
def __test():
buf = alloc(lru_allocator, 16, dtypes.float32)
assert cmp_trace_and_buf(buf, traced_buf), "Buffer should be reused"
__test()
usedbuf = alloc(lru_allocator, 16, dtypes.float32)
for _ in range(32):
def __test():
buf = alloc(lru_allocator, 16, dtypes.float32)
assert usedbuf != buf, "nobody should get the buffer that is still in use"
__test()
assert GlobalCounters.mem_used - mu == 16*dtypes.float32.itemsize, "Only usedbuf is still allocated."
test()
check_gc()
def test_lru_allocator_cache_free(self):
mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
def test():
lru_allocator = FakeAllocator(128)
refs = []
for _ in range(32):
refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32))
for sz in range(1, 32):
alloc_free_trace(lru_allocator, sz, dtypes.float32)
assert GlobalCounters.mem_used + GlobalCounters.mem_cached - mc - mu <= 128, "Should not allocate on device more than allowed (128)"
for r in refs: assert r() is None, "All refs should be dead, since buffers were cleared from cache"
test()
check_gc()
def test_lru_allocator_multidevice(self):
def test():
lru_allocator = FakeAllocator(256)
refs=[]
for i in range(8):
refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32, device=str(i)))
for i in range(64):
def __test():
dev = str(i % 8)
buf = alloc(lru_allocator, 16, dtypes.float32, device=dev)
assert cmp_trace_and_buf(buf, refs[i%8]), "Buffer should be reused"
__test()
for r in refs: assert r() is not None, "All refs should be cached"
test()
check_gc()
@unittest.skip("failing in CI")
def test_gpu_copyout(self):
def test():
from tinygrad.runtime.ops_gpu import CL
# Allocation to init the allocator.
tx = Tensor.rand(1)
tx.realize()
free_space = CL.cl_allocator.free_space[tx.lazydata.realized._device]
# fill half of the free space with a short-lived trash allocation (left in the LRU cache),
# then allocate a third of it and copy it out under memory pressure
will_allocate = free_space // 3
trash_allocation_size = free_space // 2
def sp():
trash_buffer = Tensor.rand(trash_allocation_size // 4)
trash_buffer.realize()
sp()
xx = Tensor.rand(will_allocate // 4)
_ = xx.numpy()
test()
check_gc()
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.helpers import dtypes
N = 200 # has to be bigger than the cache to fail
class TestAssign(unittest.TestCase):
def test_simple_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
a.realize()
b.realize()
ba1 = a.lazydata.realized
bb1 = b.lazydata.realized
a += b
a.realize()
ba2 = a.lazydata.realized
assert ba1 == ba2 and ba1 != bb1
np.testing.assert_allclose(a.numpy(), (np.arange(N*N)*2).reshape((N,N)))
@unittest.skipIf(Device.DEFAULT == "CPU" or Device.DEFAULT == "TORCH", "questionable tests")
def test_permuted_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
a.realize()
b.realize()
ba1 = a.lazydata.realized
bb1 = b.lazydata.realized
a = a.permute(1,0)
a += b
a.realize()
ba2 = a.lazydata.realized
assert ba1 != ba2 and ba1 != bb1
np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
def test_post_permuted_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
a.realize()
b.realize()
#GlobalCounters.cache = []
ba1 = a.lazydata.realized
bb1 = b.lazydata.realized
a.assign(a.permute(1,0) + b) # this should not work!
a.realize()
ba2 = a.lazydata.realized
# NOTE: don't test that it's assigned
#assert ba1 == ba2 and ba1 != bb1
np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
# TODO: is there a way to sneak in a permute such that it returns the wrong answer?
def test_cast_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
a.realize()
oba1 = a.lazydata.output_buffer
a.assign(a.cast(dtypes.int32).realize())
a.realize()
oba2 = a.lazydata.output_buffer
assert oba1 is None and oba2 is None
np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N)))
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,147 @@
import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
import pytest
pytestmark = [pytest.mark.exclude_cuda]
class TestConv(unittest.TestCase):
def test_simple(self):
x = Tensor.ones(1,12,128,256).contiguous().realize()
w = Tensor.ones(32,12,3,3).contiguous().realize()
ret = x.conv2d(w, stride=(2,2), padding=(1,1)).numpy()
# a full 3x3 window over 12 input channels of ones sums to 12*9 = 108; windows
# overlapping the zero padding sum to less
assert (ret[:, :, 1:-1, 1:-1] == 108).all()
assert ret[0,0,0,0] == 48 # corner: 2x2 valid window, 12*4
assert ret[0,0,0,1] == 72 # top edge: 2x3 valid window, 12*6
def test_simple_rand(self):
x = Tensor.rand(1,12,128,256)
w = Tensor.rand(32,12,3,3)
ret = x.conv2d(w, stride=(2,2), padding=(1,1)).numpy()
def test_many_simple(self):
x = Tensor(np.arange(8*2*8).reshape(1,8,2,8).astype(np.float32))
#w = Tensor(np.arange(8*8*1*1).reshape(8,8,1,1).astype(np.float32))
w = Tensor.eye(8).reshape((8,8,1,1))
ret = x.conv2d(w, stride=(1,2), padding=(0,0)).numpy()
print(ret)
def test_lazycache(self):
Tensor.no_grad = True
x = Tensor.rand(1, 32)
y = Tensor.rand(32)
out = x + y.reshape((1,32,1)).reshape((1,32)) + y.reshape((1,32,1)).reshape((1,32))
out.numpy()
Tensor.no_grad = False
def test_simple_biased(self):
C = 8
x = Tensor.rand(1,C,5,5)
w = Tensor.eye(C).reshape((C,C,1,1))
b = Tensor(np.arange(C).astype(np.float32))
ret = Tensor.conv2d(x,w,b).relu().conv2d(w,b)
print(ret.numpy())
def test_two_binops_no_rerun(self):
Tensor.no_grad = True
x = Tensor.randn(1,12,128,256)
w = Tensor.randn(32,12,3,3)
out = x.conv2d(w, stride=(2,2), padding=(1,1))
r1, r2 = out.relu(), (out-1)
np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0))
np.testing.assert_allclose(r2.numpy(), out.numpy() - 1)
Tensor.no_grad = False
def test_two_overlapping_binops_no_rerun(self):
Tensor.no_grad = True
x = Tensor.randn(1,12,128,256)
w = Tensor.randn(32,12,3,3)
out = x.conv2d(w, stride=(2,2), padding=(1,1))
r1, r2 = out.relu(), out.elu()
np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0))
np.testing.assert_allclose(r2.numpy(), np.where(out.numpy() > 0, out.numpy(), (np.exp(out.numpy()) - 1)), atol=1e-5)
Tensor.no_grad = False
@unittest.skipIf(Device.DEFAULT != "TORCH", "Takes too long to compile for Compiled backends")
def test_two_overlapping_binops_no_rerun_wino(self):
Tensor.no_grad = True
old_wino = Tensor.wino
Tensor.wino = True
x = Tensor.randn(1,4,16,16)
w = Tensor.randn(6,4,3,3)
out = x.conv2d(w, padding=(1,1))
r1, r2 = out.relu(), out.elu()
np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0))
np.testing.assert_allclose(r2.numpy(), np.where(out.numpy() > 0, out.numpy(), (np.exp(out.numpy()) - 1)), atol=1e-5)
Tensor.wino = old_wino
Tensor.no_grad = False
def test_first_three(self):
Tensor.no_grad = True
x = Tensor.rand(1,12,128,256)
w = Tensor.rand(32,12,3,3)
x = x.conv2d(w, stride=(2,2), padding=(1,1)).elu()
w = Tensor.rand(32,1,3,3)
x = x.conv2d(w, padding=(1,1), groups=32).elu()
w = Tensor.rand(16,32,1,1)
x = x.conv2d(w).elu()
x = x.numpy()
print(x.shape)
Tensor.no_grad = False
def test_elu(self):
Tensor.no_grad = True
x = Tensor.rand(1,12,128,256)
w = Tensor.rand(32,12,3,3)
x = x.conv2d(w, stride=(2,2), padding=(1,1))
x = x.elu()
w = Tensor.rand(32,1,3,3)
x = x.conv2d(w, padding=(1,1), groups=32)
out = x.numpy()
Tensor.no_grad = False
def test_reduce_relu(self):
Tensor.no_grad = True
x = Tensor.rand(1,12,128,256)
x = x.sum(keepdim=True).relu()
out = x.numpy()
Tensor.no_grad = False
def test_bias(self):
Tensor.no_grad = True
from tinygrad.nn import Conv2d
x = Tensor.rand(1,12,128,256)
c = Conv2d(12, 32, 3)
x = c(x).relu()
w = Tensor.uniform(32, 1, 3, 3)
x = x.conv2d(w, groups=32)
out = x.numpy()
Tensor.no_grad = False
def test_multiadd(self):
w = Tensor.rand(32)
x = Tensor.rand(32).relu()
(w+x).numpy()
def test_reorder(self):
x = Tensor.rand(1,12,128,256)
w = Tensor.rand(12,12,3,3)
x = x.conv2d(w, padding=(1,1))
print(x.shape)
x = x.reshape((1, 12, 256, 128))
x += 1
x += 1
x = x.reshape((1, 12, 128, 256))
x.numpy()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,27 @@
#!/usr/bin/env python
import unittest
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import Conv2d
from tinygrad.jit import CacheCollector
import pytest
pytestmark = pytest.mark.webgpu
#@unittest.skipUnless(Device.DEFAULT == "GPU", "Only GPU supports cache")
@unittest.skip("with JIT changes, you only get the raw buffer")
class TestConvShapetracker(unittest.TestCase):
def test_conv_3x3_one_view(self):
inp = Tensor.randn(1,16,10,10).realize()
conv = Conv2d(16, 32, (3,3))
conv(inp).realize()
CacheCollector.start()
conv(inp).realize()
test = CacheCollector.finish()
assert len(test) == 1, f"conv should only have one kernel {[x[0].name for x in test]}"
print(test[0][0].prg)
for arg in test[0][1]:
print(arg.st)
assert len(arg.st.views) == 1
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,107 @@
# this is an example of how you can write terrible DSP compute breaking ops like warpPerspective
# here we use a CUSTOM op to write atan2
import unittest
import numpy as np
from typing import Optional, Tuple
from tinygrad.helpers import prod, dtypes
# *** first, we implement the atan2 op at the lowest level ***
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
from tinygrad.lazy import LazyBuffer, create_lazybuffer
from tinygrad.ops import ASTRunner, Device
from tinygrad.shape.shapetracker import ShapeTracker
import pytest
pytestmark = pytest.mark.webgpu
# we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
assert a.device == "GPU" and b.device == "GPU", "gpu function requires GPUBuffers"
assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
ret.realized = Device[ret.device].buffer(prod(ret.shape), ret.dtype)
ASTRunner("atan2_gpu", """
__kernel void atan2_gpu(global float *c, global float *a, global float *b) {
int idx = get_global_id(0);
c[idx] = atan2(a[idx], b[idx]);
}""", global_size=[prod(ret.shape)]).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret.realized, a.realized, b.realized])
return ret.realized
def atan2_cpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
return Device[ret.device].from_underlying(np.arctan2(a.realized._buf, b.realized._buf))
# *** second, we write the ATan2 mlop ***
# NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
# In general, writing a backward function is optional; the backward pass just won't work without one
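# for reference, the derivative itself (standard calculus, nothing tinygrad-specific):
#   d/da atan2(a, b) =  b / (a^2 + b^2)
#   d/db atan2(a, b) = -a / (a^2 + b^2)
# which is exactly what backward() below builds out of BinaryOps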
from tinygrad.ops import LazyOp, LoadOps, BinaryOps, UnaryOps
from tinygrad.lazy import LazyBuffer
from tinygrad.tensor import Function
class ATan2(Function):
def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
self.a, self.b = a, b
ast = LazyOp(LoadOps.CUSTOM, (a.contiguous(), b.contiguous()), {"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device])
return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), LoadOps, ast, max(a.dtype, b.dtype))
def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
denom = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b))
return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \
grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.SUB, self.a).e(BinaryOps.DIV, denom)) if self.needs_input_grad[1] else None
# *** third, we use our lovely new mlop in some tests ***
from tinygrad.tensor import Tensor
@unittest.skipUnless(Device.DEFAULT in ["CPU", "GPU"], "atan2 is only implemented for CPU and GPU")
class TestCustomFunction(unittest.TestCase):
def test_atan2_forward(self):
# create some random Tensors, permute them just because we can
a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
# run the forward pass. note: up until the .numpy(), it's all lazy
c = ATan2.apply(a, b)
print(c.numpy())
# check the forward pass (in numpy)
np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)
# fun fact, this never actually calls forward, so it works in all the backends
def test_atan2_backward(self):
# have to go forward before we can go backward
a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
c = ATan2.apply(a, b)
# run the backward pass
c.mean().backward()
assert a.grad is not None and b.grad is not None, "tinygrad didn't compute gradients"
print(a.grad.numpy())
print(b.grad.numpy())
# check the backward pass (in torch)
import torch
ta, tb = torch.tensor(a.numpy(), requires_grad=True), torch.tensor(b.numpy(), requires_grad=True)
tc = torch.atan2(ta, tb)
tc.mean().backward()
assert ta.grad is not None and tb.grad is not None, "torch didn't compute gradients"
np.testing.assert_allclose(a.grad.numpy(), ta.grad.numpy(), atol=1e-5)
np.testing.assert_allclose(b.grad.numpy(), tb.grad.numpy(), atol=1e-5)
def test_atan2_jit(self):
# custom ops even work in the JIT!
from tinygrad.jit import TinyJit
@TinyJit
def jitted_atan2(a:Tensor, b:Tensor) -> Tensor:
return ATan2.apply(a, b).realize()
for _ in range(5):
a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
c = jitted_atan2(a, b)
np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,182 @@
import unittest
import numpy as np
from tinygrad.helpers import CI, DTYPES_DICT, getenv, DType, DEBUG, ImageDType, PtrDType
from tinygrad.ops import Device
from tinygrad.tensor import Tensor, dtypes
from typing import Any, List
from extra.utils import OSX, temp
def is_dtype_supported(dtype: DType):
# for GPU, cl_khr_fp16 isn't supported (except now we don't need it!)
# for LLVM, it segfaults because it can't link to the casting function
if dtype == dtypes.half: return not (CI and Device.DEFAULT in ["GPU", "LLVM"]) and Device.DEFAULT != "WEBGPU" and getenv("CUDACPU") != 1
if dtype == dtypes.bfloat16: return False # numpy doesn't support bf16, tested separately in TestBFloat16DType
if dtype == dtypes.float64: return Device.DEFAULT not in ["WEBGPU", "METAL"] and not OSX
if dtype in [dtypes.int8, dtypes.uint8]: return Device.DEFAULT not in ["WEBGPU"]
if dtype in [dtypes.int16, dtypes.uint16]: return Device.DEFAULT not in ["WEBGPU", "TORCH"]
if dtype == dtypes.uint32: return Device.DEFAULT not in ["TORCH"]
if dtype in [dtypes.int64, dtypes.uint64]: return Device.DEFAULT not in ["WEBGPU", "TORCH"]
if dtype == dtypes.bool:
# host-shareability is a requirement for storage buffers, but the 'bool' type is not host-shareable
if Device.DEFAULT == "WEBGPU": return False
# TODO remove triton from here once internal casting is fixed. CAST of fp32s between 0-1 is broken in triton
if getenv("TRITON") == 1: return False
return True
def get_available_cast_dtypes(dtype: DType) -> List[DType]: return [v for k, v in DTYPES_DICT.items() if v != dtype and is_dtype_supported(v) and not k.startswith("_")] # don't cast internal dtypes
def _test_to_np(a:Tensor, np_dtype, target):
if DEBUG >= 2: print(a)
na = a.numpy()
if DEBUG >= 2: print(na, na.dtype, a.lazydata.realized)
try:
assert na.dtype == np_dtype
np.testing.assert_allclose(na, target)
except AssertionError as e:
raise AssertionError(f"\ntensor {a.numpy()} does not match target {target} with np_dtype {np_dtype}") from e
def _assert_eq(tensor:Tensor, target_dtype:DType, target):
if DEBUG >= 2: print(tensor.numpy())
try:
assert tensor.dtype == target_dtype
np.testing.assert_allclose(tensor.numpy(), target)
except AssertionError as e:
raise AssertionError(f"\ntensor {tensor.numpy()} dtype {tensor.dtype} does not match target {target} with dtype {target_dtype}") from e
def _test_op(fxn, target_dtype:DType, target): _assert_eq(fxn(), target_dtype, target)
def _test_cast(a:Tensor, target_dtype:DType): _test_op(lambda: a.cast(target_dtype), target_dtype, a.numpy().astype(target_dtype.np).tolist())
def _test_bitcast(a:Tensor, target_dtype:DType, target): _test_op(lambda: a.bitcast(target_dtype), target_dtype, target)
class TestDType(unittest.TestCase):
DTYPE: Any = None
DATA: Any = None
@classmethod
def setUpClass(cls):
if not is_dtype_supported(cls.DTYPE): raise unittest.SkipTest("dtype not supported")
cls.DATA = np.random.randint(0, 100, size=10, dtype=cls.DTYPE.np).tolist() if dtypes.is_int(cls.DTYPE) else np.random.choice([True, False], size=10).tolist() if cls.DTYPE == dtypes.bool else np.random.uniform(0, 1, size=10).tolist()
def setUp(self):
if self.DTYPE is None: raise unittest.SkipTest("base class")
def test_to_np(self): _test_to_np(Tensor(self.DATA, dtype=self.DTYPE), self.DTYPE.np, np.array(self.DATA, dtype=self.DTYPE.np))
def test_casts_to(self): list(map(
lambda dtype: _test_cast(Tensor(self.DATA, dtype=dtype), self.DTYPE),
get_available_cast_dtypes(self.DTYPE)
))
def test_casts_from(self): list(map(
lambda dtype: _test_cast(Tensor(self.DATA, dtype=self.DTYPE), dtype),
get_available_cast_dtypes(self.DTYPE)
))
def test_upcast_ops(self): list(map(
lambda dtype: _test_ops(a_dtype=self.DTYPE, b_dtype=dtype, target_dtype=dtype) if dtype.sz > self.DTYPE.sz else None,
get_available_cast_dtypes(self.DTYPE)
))
def test_upcast_to_ops(self): list(map(
lambda dtype: _test_ops(a_dtype=dtype, b_dtype=self.DTYPE, target_dtype=self.DTYPE) if dtype.sz < self.DTYPE.sz else None,
get_available_cast_dtypes(self.DTYPE)
))
def _test_ops(a_dtype:DType, b_dtype:DType, target_dtype:DType):
if not is_dtype_supported(a_dtype) or not is_dtype_supported(b_dtype): raise unittest.SkipTest("dtype not supported")
_assert_eq(Tensor([1,2,3,4], dtype=a_dtype)+Tensor([1,2,3,4], dtype=b_dtype), target_dtype, [2,4,6,8])
_assert_eq(Tensor([1,2,3,4], dtype=a_dtype)*Tensor([1,2,3,4], dtype=b_dtype), target_dtype, [1,4,9,16])
_assert_eq(Tensor([[1,2],[3,4]], dtype=a_dtype)@Tensor.eye(2, dtype=b_dtype), target_dtype, [[1,2],[3,4]])
_assert_eq(Tensor([1,1,1,1], dtype=a_dtype)+Tensor.ones((4,4), dtype=b_dtype), target_dtype, 2*Tensor.ones(4,4).numpy())
class TestBFloat16DType(unittest.TestCase):
def setUp(self):
if not is_dtype_supported(dtypes.bfloat16): raise unittest.SkipTest("bfloat16 not supported")
def test_bf16_to_float(self):
with self.assertRaises(AssertionError):
_test_cast(Tensor([100000], dtype=dtypes.bfloat16), dtypes.float32, [100000])
def test_float_to_bf16(self):
with self.assertRaises(AssertionError):
_test_cast(Tensor([100000], dtype=dtypes.float32), dtypes.bfloat16, [100000])
# torch.tensor([10000, -1, -1000, -10000, 20]).type(torch.bfloat16)
def test_bf16(self):
t = Tensor([10000, -1, -1000, -10000, 20]).cast(dtypes.bfloat16)
t.realize()
back = t.cast(dtypes.float32)
assert tuple(back.numpy().tolist()) == (9984., -1, -1000, -9984, 20)
def test_bf16_disk_write_read(self):
t = Tensor([10000, -1, -1000, -10000, 20]).cast(dtypes.float32)
t.to(f"disk:{temp('f32')}").realize()
# hack to "cast" f32 -> bf16
dat = open(temp('f32'), "rb").read()
adat = b''.join([dat[i+2:i+4] for i in range(0, len(dat), 4)])
with open(temp('bf16'), "wb") as f: f.write(adat)
t = Tensor.empty(5, dtype=dtypes.bfloat16, device=f"disk:{temp('bf16')}").llvm().realize()
back = t.cast(dtypes.float32)
assert tuple(back.numpy().tolist()) == (9984., -1, -1000, -9984, 20)
class TestHalfDtype(TestDType): DTYPE = dtypes.half
class TestFloatDType(TestDType): DTYPE = dtypes.float
class TestDoubleDtype(TestDType): DTYPE = dtypes.double
class TestInt8Dtype(TestDType):
DTYPE = dtypes.int8
@unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
def test_int8_to_uint8_negative(self): _test_op(lambda: Tensor([-1, -2, -3, -4], dtype=dtypes.int8).cast(dtypes.uint8), dtypes.uint8, [255, 254, 253, 252])
class TestUint8Dtype(TestDType):
DTYPE = dtypes.uint8
@unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
def test_uint8_to_int8_overflow(self): _test_op(lambda: Tensor([255, 254, 253, 252], dtype=dtypes.uint8).cast(dtypes.int8), dtypes.int8, [-1, -2, -3, -4])
@unittest.skipIf(Device.DEFAULT not in {"CPU", "TORCH"}, "only bitcast in CPU and TORCH")
class TestBitCast(unittest.TestCase):
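# the magic constants below are IEEE-754 bit patterns reinterpreted as integers:
# float32 1.0 is 0x3F800000 == 1065353216, 2.0 is 0x40000000 == 1073741824, etc.
# (check with numpy: np.float32(1.0).view(np.int32) -> 1065353216)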
def test_float32_bitcast_to_int32(self): _test_bitcast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.int32, [1065353216, 1073741824, 1077936128, 1082130432])
@unittest.skipIf(Device.DEFAULT == "TORCH", "no uint32 in torch")
def test_float32_bitcast_to_uint32(self): _test_bitcast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.uint32, [1065353216, 1073741824, 1077936128, 1082130432])
def test_int32_bitcast_to_float32(self): _test_bitcast(Tensor([1065353216, 1073741824, 1077936128, 1082130432], dtype=dtypes.int32), dtypes.float32, [1.0, 2.0, 3.0, 4.0])
# NOTE: these are the same as normal casts
def test_int8_bitcast_to_uint8(self): _test_bitcast(Tensor([-1, -2, -3, -4], dtype=dtypes.int8), dtypes.uint8, [255, 254, 253, 252])
def test_uint8_bitcast_to_int8(self): _test_bitcast(Tensor([255, 254, 253, 252], dtype=dtypes.uint8), dtypes.int8, [-1, -2, -3, -4])
@unittest.skipIf(Device.DEFAULT == "TORCH", "no uint64 in torch")
def test_int64_bitcast_to_uint64(self): _test_bitcast(Tensor([-1, -2, -3, -4], dtype=dtypes.int64), dtypes.uint64, [18446744073709551615, 18446744073709551614, 18446744073709551613, 18446744073709551612])
@unittest.skipIf(Device.DEFAULT == "TORCH", "no uint64 in torch")
def test_uint64_bitcast_to_int64(self): _test_bitcast(Tensor([18446744073709551615, 18446744073709551614, 18446744073709551613, 18446744073709551612], dtype=dtypes.uint64), dtypes.int64, [-1, -2, -3, -4])
def test_shape_change_bitcast(self):
with self.assertRaises(AssertionError):
_test_bitcast(Tensor([100000], dtype=dtypes.float32), dtypes.uint8, [100000])
class TestInt16Dtype(TestDType): DTYPE = dtypes.int16
class TestUint16Dtype(TestDType): DTYPE = dtypes.uint16
class TestInt32Dtype(TestDType): DTYPE = dtypes.int32
class TestUint32Dtype(TestDType): DTYPE = dtypes.uint32
class TestInt64Dtype(TestDType): DTYPE = dtypes.int64
class TestUint64Dtype(TestDType): DTYPE = dtypes.uint64
class TestBoolDtype(TestDType): DTYPE = dtypes.bool
class TestEqStrDType(unittest.TestCase):
def test_image_ne(self):
assert dtypes.float == dtypes.float32, "float doesn't match?"
assert dtypes.imagef((1,2,4)) != dtypes.imageh((1,2,4)), "different image dtype doesn't match"
assert dtypes.imageh((1,2,4)) != dtypes.imageh((1,4,2)), "different shape doesn't match"
assert dtypes.imageh((1,2,4)) == dtypes.imageh((1,2,4)), "same shape matches"
assert isinstance(dtypes.imageh((1,2,4)), ImageDType)
def test_ptr_ne(self):
# TODO: is this the wrong behavior?
assert PtrDType(dtypes.float32) == dtypes.float32
#assert PtrDType(dtypes.float32) == PtrDType(dtypes.float32)
#assert PtrDType(dtypes.float32) != dtypes.float32
def test_strs(self):
self.assertEqual(str(dtypes.imagef((1,2,4))), "dtypes.imagef((1, 2, 4))")
self.assertEqual(str(PtrDType(dtypes.float32)), "ptr.dtypes.float")
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python
import gc
import unittest
import numpy as np
from tinygrad.tensor import Tensor
def tensors_allocated():
return sum([isinstance(x, Tensor) for x in gc.get_objects()])
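# NOTE: backward() allocates .grad Tensors for the leaves, which is why the counts
# in test_gc_complex go from 2 to 4 after a backward pass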
class TestGC(unittest.TestCase):
def test_gc(self):
a = Tensor.zeros(4, 4, requires_grad=True)
b = Tensor.zeros(4, 4, requires_grad=True)
(a*b).mean().backward()
assert(tensors_allocated() > 0)
del a,b
assert(tensors_allocated() == 0)
def test_gc_complex(self):
a = Tensor(np.zeros((4, 4), dtype=np.float32), requires_grad=True)
b = Tensor(np.zeros((4, 4), dtype=np.float32), requires_grad=True)
assert(tensors_allocated() == 2)
(a*b).mean().backward()
assert(tensors_allocated() == 4)
del b
assert(tensors_allocated() == 2)
b = Tensor(np.zeros((4, 4), dtype=np.float32), requires_grad=True)
print(tensors_allocated())
(a*b).mean().backward()
print(tensors_allocated())
assert(tensors_allocated() == 4)
del b
assert(tensors_allocated() == 2)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
import pytest
pytestmark = pytest.mark.webgpu
# NOTE: METAL fails; it might be platform- and optimization-option dependent.
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["METAL", "WEBGPU"], f"no JIT on {Device.DEFAULT}")
class TestJit(unittest.TestCase):
def test_simple_jit(self):
@TinyJit
def add(a, b): return (a+b).realize()
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c = add(a, b)
np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
assert len(add.jit_cache) == 1
def test_jit_multiple_outputs(self):
@TinyJit
def f(a, b): return (a+b).realize(), (a-b).realize(), (a*b).realize()
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c, d, e = f(a, b)
np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
np.testing.assert_allclose(d.numpy(), a.numpy()-b.numpy(), atol=1e-4, rtol=1e-5)
np.testing.assert_allclose(e.numpy(), a.numpy()*b.numpy(), atol=1e-4, rtol=1e-5)
assert len(f.jit_cache) == 3
def test_nothing_jitted(self):
@TinyJit
def add(a, b): return a+b
with self.assertRaises(AssertionError):
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c = add(a, b)
def test_jit_shape_mismatch(self):
@TinyJit
def add(a, b): return (a+b).realize()
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c = add(a, b)
bad = Tensor.randn(20, 20)
with self.assertRaises(AssertionError):
add(a, bad)
def test_jit_shape_views_mismatch(self):
@TinyJit
def add(a): return (a+1).realize()
with self.assertRaises(AssertionError):
for i in range(1,5):
# a has an offset that the kernel doesn't know about
a = Tensor.randn(10, 10).realize()[:, i:i+2]
add(a)
def test_jit_duplicate_fail(self):
# the jit doesn't support duplicate arguments
@TinyJit
def add(a, b): return (a+b).realize()
a = Tensor.randn(10, 10)
with self.assertRaises(AssertionError):
add(a, a)
def test_kwargs_jit(self):
@TinyJit
def add_kwargs(first, second): return (first+second).realize()
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c = add_kwargs(first=a, second=b)
np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
assert len(add_kwargs.jit_cache) == 1
def test_array_jit(self):
@TinyJit
def add_array(a, arr): return (a+arr[0]).realize()
for i in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
a.realize(), b.realize()
c = add_array(a, [b])
if i >= 2:
# should fail once jitted since jit can't handle arrays
np.testing.assert_allclose(np.any(np.not_equal(c.numpy(),a.numpy()+b.numpy())), True, atol=1e-4, rtol=1e-5)
else:
np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
assert len(add_array.jit_cache) == 1
def test_method_jit(self):
class Fun:
def __init__(self):
self.a = Tensor.randn(10, 10)
@TinyJit
def __call__(self, b:Tensor) -> Tensor:
return (self.a+b).realize()
fun = Fun()
for _ in range(5):
b = Tensor.randn(10, 10)
c = fun(b)
np.testing.assert_allclose(c.numpy(), fun.a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
assert len(fun.__call__.func.__self__.jit_cache) == 1
def test_jit_size1_input(self):
@TinyJit
def f(a, b): return (a+b).realize()
a = Tensor([1, 2, 3])
for i in range(5):
np.testing.assert_allclose(f(a, Tensor([i])).numpy(), (a+i).numpy(), atol=1e-4, rtol=1e-5)
assert len(f.jit_cache) == 1
def test_jit_output_non_tensor_fail(self):
@TinyJit
def f(a, b, i): return (a+b).realize(), i
output1, output2 = [], []
expect1, expect2 = [], []
for i in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
o1, o2 = f(a, b, i)
output1.append(o1.numpy().copy())
output2.append(o2)
expect1.append(a.numpy().copy()+b.numpy().copy())
expect2.append(i)
np.testing.assert_allclose(output1, expect1, atol=1e-4, rtol=1e-5)
# the jit only works with Tensor outputs
assert output2 != expect2
assert len(f.jit_cache) == 1
@unittest.skip("random isn't working in JIT")
def test_jit_random_regen(self):
def f(a, b):
rn = Tensor.randn(*a.shape)
return ((a+b)*rn).realize()
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
Tensor._seed = 1234
jf = TinyJit(f)
res = set()
for _ in range(5):
o1 = jf(a, b)
res.add(o1.numpy()[0][0])
assert len(res) == 5, "All values should be different, rand works in jit."
Tensor._seed = 1234
jf2 = TinyJit(f)
res2 = set()
for _ in range(5):
o1 = jf2(a, b)
res2.add(o1.numpy()[0][0])
assert len(res2) == 5, "All values should be different, rand works in jit."
assert res == res2, "Jit rand is not reproducible with the same seed"
Tensor._seed = 3421
jf3 = TinyJit(f)
res3 = set()
for _ in range(5):
o1 = jf3(a, b)
res3.add(o1.numpy()[0][0])
assert len(res3) == 5, "All values should be different, rand works in jit."
assert res3 != res2, "jit rand should produce different values with a different seed"
def test_jit_realization_and_sampling(self):
w = Tensor.eye(5)
@TinyJit
def foo (x): return w.dot(x).realize()
arg = [
Tensor([1,2,3,4,5]),
Tensor([1,3,3,4,6]),
Tensor([1,2,5,4,7]),
Tensor([0,2,3,1,0]),
]
Y = [foo(e).numpy() for e in arg]
foo(Tensor([7,7,7,7,7]))
want = [[1., 2., 3., 4., 5.],
[1., 3., 3., 4., 6.],
[1., 2., 5., 4., 7.],
[0., 2., 3., 1., 0.]]
np.testing.assert_allclose(want, Y)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python
import unittest
import secrets
import string
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.helpers import diskcache
def generate_random_string(length=16):
alphabet = string.ascii_letters + string.digits
return ''.join(secrets.choice(alphabet) for _ in range(length))
compile_call_count = 0
@diskcache
def helper_test_compile(prg:str) -> bytes:
global compile_call_count
compile_call_count += 1
return prg.encode()
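# @diskcache memoizes helper_test_compile on disk keyed by its argument, so a repeated
# call with the same prg must return the cached bytes without bumping compile_call_count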
class TestKernelCache(unittest.TestCase):
def test_compile_cache(self):
prg1 = generate_random_string(64) + "a"
prg2 = generate_random_string(64) + "b"
cold_compile_res = helper_test_compile(prg1)
warm_compile_res = helper_test_compile(prg1)
assert cold_compile_res == warm_compile_res == prg1.encode()
assert compile_call_count == 1
prg2_res = helper_test_compile(prg2)
assert prg2_res == prg2.encode()
assert compile_call_count == 2
def test_kernel_cache_in_action(self):
if Device.DEFAULT not in ["CLANG"]:
self.skipTest("No custom kernel cache is implemented")
a = Tensor.rand(4,4)
b = Tensor.rand(4,4)
x = a + b
x.realize()
orig_compile_func = Device['CLANG'].compiler
Device['CLANG'].compiler = None # making it not callable
a1 = Tensor.rand(4,4)
b1 = Tensor.rand(4,4)
x1 = a1 + b1
x1.realize() # the same kernel should come from the cache
Device['CLANG'].compiler = orig_compile_func
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,73 @@
#!/usr/bin/env python
import numpy as np
import unittest
from tinygrad.lazy import LazyBuffer
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.shape.symbolic import Variable
from tinygrad.jit import CacheCollector
class TestLazyBuffer(unittest.TestCase):
def test_fromcpu_buffer_sharing(self):
a = np.arange(8)
assert LazyBuffer.fromCPU(a).realized._buf is a
def test_fromcpu_shape_tracker(self):
def helper(a: np.ndarray):
print(a.shape, a.strides, a.flags.c_contiguous)
b = LazyBuffer.fromCPU(a)
#assert b.st.contiguous == a.flags.c_contiguous
assert b.st.shape == a.shape
np.testing.assert_equal(a, Tensor(b).numpy())
for ndims in range(1, 4):
a = np.random.randn(*(4,)*ndims).astype(np.float32)
for stride in [-2, 1, 2]:
for start in [0, 1]:
helper(a[(slice(start, None, stride),)*ndims])
def test_shuffle_pad_ops_cmpeq(self):
y = Tensor([1]).cat(Tensor([1]) == 0).numpy()
z = Tensor([1, 0]).numpy()
np.testing.assert_allclose(y, z)
def test_shuffle_pad_ops_div(self):
y = Tensor([1]).cat(Tensor([1]).div(Tensor([2.0]))).numpy()
z = Tensor([1, 0.5]).numpy()
np.testing.assert_allclose(y, z)
def test_shuffle_pad_ops_log(self):
y = Tensor([1]).cat(Tensor([1]).log()).numpy()
z = Tensor([1, 0]).numpy()
np.testing.assert_allclose(y, z)
def test_shuffle_pad_ops_exp(self):
y = Tensor([1]).cat(Tensor([1]).exp()).numpy()
z = Tensor([1, np.e]).numpy()
np.testing.assert_allclose(y, z)
@unittest.skipUnless(Device.DEFAULT in ["METAL", "CUDA", "GPU"], "Only GPU backends supports cache")
def test_children_count(self):
a = Tensor.ones(8,8,8)
d1 = a.sum((0))
d2 = a.sum((0)).reshape(32,2)
assert len(d1.lazydata.op.src[0].children) == 1
in1 = d1.reshape(16,4)
d3 = in1.reshape(8,8)
assert len(d3.lazydata.op.src[0].children) == 2
CacheCollector.start()
l = Tensor.ones(8,8)
r = Tensor.ones(8,8)
dd = d1 + l
dd.realize()
de = d3 + r
de.realize()
cache = CacheCollector.finish()
assert len(cache) == 3
assert cache[0][0].name.startswith("r_") # the reduce should not be merged twice
assert cache[1][0].name.startswith("E_")
assert cache[2][0].name.startswith("E_")
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,21 @@
import unittest
from tinygrad.tensor import Tensor
# stuff needed to unpack a kernel
from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
from tinygrad.helpers import dtypes
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.shape.symbolic import Variable
inf, nan = float('inf'), float('nan')
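# inf and nan are bound above so that eval(str(ast)) round-trips even when the repr
# contains bare inf/nan literals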
class TestLazyOp(unittest.TestCase):
def test_lazyop_str(self):
t = Tensor.rand(10) + Tensor.rand(10)
s = t.lazydata.schedule()
ast = s[-1].ast
ast_remade = eval(str(ast))
self.assertEqual(ast, ast_remade)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,492 @@
import numpy as np
import unittest, os
from tinygrad.codegen.kernel import Opt, OptOps, tensor_cores
from tinygrad.codegen.linearizer import Linearizer, UOps
from tinygrad.ops import Compiled, Device, LoadOps
from tinygrad.tensor import Tensor
from tinygrad.jit import CacheCollector
from tinygrad.realize import run_schedule
from tinygrad.helpers import dtypes, prod
class TestLinearizer(unittest.TestCase):
def test_arg_dedup(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled supports cache")
a, b = Tensor.randn(4), Tensor.randn(4)
np_a, np_b = a.numpy(), b.numpy()
CacheCollector.start()
c = ((a.shrink(((0, 2),)) - a.shrink(((2, 4),))) - (b.shrink(((0, 2),)) - b.shrink(((2, 4),)))).realize()
rawbufs = CacheCollector.finish()[0][1]
assert len(rawbufs) == 3 and set(rawbufs[1:]) == {a.lazydata.realized, b.lazydata.realized}
np_c = (np_a[:2] - np_a[2:]) - (np_b[:2] - np_b[2:])
np.testing.assert_allclose(np_c, c.numpy(), atol=1e-4, rtol=1e-4)
def test_load_dedup(self):
# for different leaves in the AST, the same loads may occur.
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
a = Tensor.randn(4).realize()
# these are of size 3 to avoid float4 coalesce
r = a[:-1] + a[1:]
k = Linearizer(r.lazydata.schedule()[-1].ast)
k.upcast()
k.linearize()
num_loads = len([uop for uop in k.uops if uop.uop == UOps.LOAD])
assert num_loads <= 4, "more load uops than needed"
assert num_loads >= 4, "unexpected number of uops, maybe this test needs updating?"
def test_upcast_cse(self):
# when upcasting, within a subtree, there may be common expressions.
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
a, b = Tensor.randn(1).realize(), Tensor.randn(1).realize()
r = a.expand([2]) + b.expand([2])
k = Linearizer(r.lazydata.schedule()[-1].ast)
k.upcast()
k.linearize()
num_ops = len([uop for uop in k.uops if uop.uop == UOps.ALU])
assert num_ops <= 1, "more alu uops than needed"
def test_zero_fold(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
a, b = Tensor.randn(1).realize(), Tensor.randn(1).realize()
r = Tensor.stack([a, b])
k = Linearizer(r.lazydata.schedule()[-1].ast)
k.upcast()
k.linearize()
num_ops = len([uop for uop in k.uops if uop.uop == UOps.ALU])
assert num_ops == 0, "more alu uops than needed"
@unittest.skip("constant folding not supported yet")
def test_constant_fold(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
a, b = Tensor(2), Tensor(3)
r = a * b
k = Linearizer(r.lazydata.schedule()[-1][0])
k.linearize()
num_ops = len([uop for uop in k.uops if uop.uop in [UOps.LOAD, UOps.ALU]])
assert num_ops <= 0, "more load or alu uops than needed"
def test_tensor_cores(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
if Device.DEFAULT not in tensor_cores:
self.skipTest("No tensor cores for device")
for tc in tensor_cores[Device.DEFAULT]:
if tc.arch is not None and tc.arch != os.uname().machine: continue
a, b = Tensor.rand(tc.dims[0], tc.dims[2], dtype=tc.dtype_in), Tensor.rand(tc.dims[2], tc.dims[1], dtype=tc.dtype_in)
np_a, np_b = a.numpy(), b.numpy()
if tc.dtype_out != tc.dtype_in:
r = (a.reshape(tc.dims[0], 1, tc.dims[2]) * b.permute(1,0).reshape(1, tc.dims[1], tc.dims[2])).cast(tc.dtype_out).sum(axis=2)
else:
r = a @ b
realized_ast, _ = helper_realized_ast(r)
k = Linearizer(realized_ast)
k.apply_tensor_cores(1)
k.linearize()
assert len([uop for uop in k.uops if uop.uop == UOps.WMMA]) == 1, "tensor core not triggered"
np_c = np_a @ np_b
np.testing.assert_allclose(np_c, r.numpy(), atol=5e-3, rtol=1e-4)
def test_limit_dims_to_max_5d_global(self):
t = Tensor.rand(3, 4, 5, 6, 7).pad(((1, 1), (1, 1), (1, 1), (1, 1), (1, 1))) + 1
sched = [si for si in t.lazydata.schedule() if si.ast.op not in LoadOps]
assert len(sched) == 1
lin = Linearizer(sched[0].ast)
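# the input shape (3, 4, 5, 6, 7) padded by 1 on each side gives (5, 6, 7, 8, 9)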
assert lin.full_shape[:lin.global_dims] == (5, 6, 7, 8, 9)
lin.limit_dims_to_max(global_max=[16, 16, 16], local_max=[16, 16, 16])
def helper_realized_ast(r:Tensor):
s = r.lazydata.schedule()
run_schedule(s[:-1]) # run all kernels except the last one
# now all input LazyBuffers buffers in s[-1] should be realized
output_buffer = Device[s[-1].out.device].buffer(prod((sh if isinstance(sh, int) else sh.max for sh in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer (symbolic dims use their max size)
return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]
class TestFloat4(unittest.TestCase):
def setUp(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.supports_float4:
self.skipTest("Device does not support float4")
@staticmethod
def count_float4(k):
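# returns (number of float4 LOAD uops, number of float4 STORE uops) in the linearized kernel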
return (len([uop for uop in k.uops if uop.uop == UOps.LOAD and uop.dtype == dtypes._float4]),
len([uop for uop in k.uops if uop.uop == UOps.STORE and len(uop.vin) == 3 and uop.vin[2].dtype == dtypes._float4]))
# TODO: express opts below as auto opts
def test_float4_basic(self):
a = Tensor.rand(2, 8).realize()
b = Tensor.rand(2, 8).realize()
c = a + b
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.hand_coded_optimizations()
k.linearize()
assert TestFloat4.count_float4(k) == (2, 1)
def test_float4_multidim(self):
a = Tensor.rand(2, 8).realize()
b = Tensor.rand(2, 8).realize()
c = a + b
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(0, 4) # float4 dimension
k.shift_to(0, 2, insert_before=k.shape_len-1)
k.upcast()
k.upcast()
k.local_dims += 1
k.linearize()
assert TestFloat4.count_float4(k) == (4, 2)
def test_float4_unaligned_load(self):
a = Tensor.rand(9).realize().shrink(((1, 9),))
b = Tensor.rand(9).realize().shrink(((1, 9),))
c = a + b
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.hand_coded_optimizations() # implicit trigger float4 dim
k.linearize()
assert TestFloat4.count_float4(k) == (0, 1)
def test_float4_multidim_unaligned_load(self):
a = Tensor.rand(2, 9).realize().shrink(((0, 2), (1, 9),))
b = Tensor.rand(2, 9).realize().shrink(((0, 2), (1, 9),))
c = a + b
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(len(k.full_unupcasted_shape)-1, 4) # manual trigger float4 dim
k.upcast()
k.shift_to(len(k.full_unupcasted_shape)-1, 2, insert_before=k.shape_len-1)
k.upcast()
k.local_dims += 1
k.linearize()
assert TestFloat4.count_float4(k) == (0, 2)
def test_float4_sometimes_unaligned(self):
a = Tensor.rand(1, 1, 8).realize()
b = Tensor.rand(1, 1, 5).realize().shrink(((0, 1), (0, 1), (1, 5)))
c = a.conv2d(b)
# only the first and last conv dot products are aligned in a, and b is never aligned, so no
# float4 should be emitted (the reduce axis of size 4 is the float4 axis here)
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (0, 0)
def test_float4_multidim_sometimes_unaligned(self):
a = Tensor.rand(1, 1, 7).realize()
b = Tensor.rand(1, 1, 5).realize().shrink(((0, 1), (0, 1), (1, 5)))
c = a.conv2d(b)
# the first conv dot product is aligned in a. If we upcast the output and reduce
# dimension, then we could do float4 for only that one set of loads, but we currently
# don't.
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.upcast()
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (0, 1)
def test_float4_noncontiguous(self):
a = Tensor.rand(4, 2).realize()
b = Tensor.rand(4, 2).realize()
c = a + b
# we will upcast the top axis of sz 4. they should not be coalesced into float4,
# since the top axis is not contiguous.
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(0, 4, top=True) # top axes are float4 axes
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (0, 0)
def test_float4_expand(self):
a = Tensor.rand(9).realize().shrink(((1, 9),))
b = Tensor.rand(2).realize().reshape((2, 1)).expand((2,4)).reshape((8,))
c = a + b
# we upcast an axis of size 4. the loads should not be coalesced into float4:
# a is unaligned from the shrink, and b's elements repeat due to the expand
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(0, 4) # float4 axis
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (0, 1)
def test_float4_heterogeneous(self):
a = Tensor.rand(8).realize()
b = Tensor.rand(9).realize().shrink(((1, 9),))
c = a + b
# should float4 b but not a
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(0, 4) # float4 axis
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (1, 1)
class TestHandCodedOpts(unittest.TestCase):
def setUp(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Device does not use linearizer")
def test_masked_upcast(self):
layer_1 = Tensor.cat(*[Tensor.rand(5) for _ in range(4)])
layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.rand(6, 20))
s = layer_2.lazydata.schedule()[-1]
k = Linearizer(s.ast)
k.hand_coded_optimizations()
assert len(k.bufs) == 6 # make sure all ops are done in one kernel
# masked upcast should upcast masked axis of size 7
# masked upcast should not upcast large (20) last axis
# float4/other hcopt shouldn't upcast the last axis, since we already have the size-7 upcast, and the last axis is not very contiguous
assert k.upcasted == 1 and k.full_shape[-1] == 7
def test_masked_upcast_wino(self):
monster = Tensor.stack([Tensor.stack([Tensor.rand(16) for _ in range(6)]) for _ in range(6)])
s = monster.lazydata.schedule()[-1]
k = Linearizer(s.ast)
k.hand_coded_optimizations()
assert len(k.bufs) == 37 # make sure all ops are done in one kernel
# should upcast the two Tensor.stacks
assert k.upcasted >= 2 and k.full_shape[k.shape_len-k.upcasted:k.shape_len].count(6) == 2
def test_masked_upcast_wino_full(self):
old_wino = Tensor.wino
Tensor.wino = True
x,w = Tensor.rand(1,4,9,9, requires_grad=True).realize(), Tensor.rand(4,4,3,3, requires_grad=True).realize()
out = Tensor.conv2d(x,w, padding=1)
upcasts = []
# collect upcasts of tile transform kernels
for i, si in enumerate(out.lazydata.schedule()):
k = Linearizer(si.ast)
k.hand_coded_optimizations()
if k.reduceop is not None: continue # not a tile transform kernel (there is a gemm reduce kernel)
if len(k.bufs) < 100: continue # not a tile transform kernel (there's a permute kernel at the end)
upcasts.append(tuple(k.full_shape[k.shape_len - k.upcasted:k.shape_len]))
assert len(upcasts) == 3 # 3 transformation matrices
assert upcasts.count((6, 6)) == 2 and upcasts.count((4, 4)) == 1
out.mean().backward()
for si in x.grad.lazydata.schedule() + w.grad.lazydata.schedule():
k = Linearizer(si.ast)
k.hand_coded_optimizations()
k.linearize()
if len(k.bufs) < 20: continue # not a tile transform kernel
# heuristic number to make sure that at least some upcasts but not too many upcasts are being done
assert 6 <= prod(k.full_shape[k.shape_len - k.upcasted:k.shape_len]) <= 49
Tensor.wino = old_wino
def test_masked_upcast_many(self):
layer_1 = Tensor.cat(Tensor.rand(3, 4), Tensor.rand(4, 4))
layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.rand(6, 7, 4))
layer_3 = Tensor.cat(layer_2.unsqueeze(0), Tensor.rand(6, 7, 7, 4))
s = layer_3.lazydata.schedule()[-1]
k = Linearizer(s.ast)
k.hand_coded_optimizations()
assert len(k.bufs) == 5 # make sure all ops are done in one kernel
# check that we don't do too many upcasts
assert prod(k.full_shape[k.shape_len-k.upcasted:k.shape_len]) <= 49
def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
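# runs the kernel unoptimized as a baseline, then re-runs it with hand-coded and any custom opts, checking every variant produces the same output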
wanna_output = None
realized_ast, real_bufs = helper_realized_ast(r)
def check_opt(opts, create_k, to_prg):
k = create_k()
if apply_tc:
k.apply_tensor_cores(1, opts)
else:
for opt in opts:
k.apply_opt(opt)
prg = to_prg(k)
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
prg.exec(real_bufs, force_wait=True)
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
# Get baseline, which is not optimized at all.
k = Linearizer(realized_ast)
prg = Device[Device.DEFAULT].to_program(k)
prg.exec(real_bufs, force_wait=True)
wanna_output = real_bufs[0].toCPU().copy()
# Check correctness of hand-coded optimizations.
k = Linearizer(realized_ast)
k.hand_coded_optimizations()
prg = Device[Device.DEFAULT].to_program(k)
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
prg.exec(real_bufs, force_wait=True)
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
for x in opts: # Check custom transformations if any.
check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_program)
class TestLinearizerOpts(unittest.TestCase):
def test_local_and_grouped_reduce(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
self.skipTest("Only Compiled uses linearizer with locals and shared")
N = 128
Tensor.manual_seed(1882)
a = Tensor.rand(4, 4, N, N)
b = Tensor.rand(4, 4, N)
r = (b.sqrt() + ((a+1).sum(axis=3).exp()))
helper_linearizer_opt(r, [
[Opt(OptOps.LOCAL, 0, 2)],
[Opt(OptOps.LOCAL, 0, 8)],
[Opt(OptOps.LOCAL, 0, 16)], # Checking how it works with locals
[Opt(OptOps.GROUPTOP, 0, 2)],
[Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.GROUPTOP, 0, 64)], # Checking how it works with grouped reduce
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2)],
[Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.GROUPTOP, 0, 16)],
[Opt(OptOps.LOCAL, 0, 32), Opt(OptOps.GROUPTOP, 0, 2)],
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 64)], # Checking how it works with locals + grouped reduce
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.UNROLL, 1, 4)], # Checking how it works with locals + grouped reduce + upcasts
])
def test_upcasts(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
N = 16
Tensor.manual_seed(1772)
a = Tensor.rand(N, N)
b = Tensor.rand(N, N)
r = (a+b).sqrt() * ((a+1).exp())
helper_linearizer_opt(r, [
[Opt(OptOps.UPCAST, 0, 2)],
[Opt(OptOps.UPCAST, 0, 4)],
[Opt(OptOps.UPCAST, 0, 8)], # Checking how it works with upcasts
])
def test_full_upcast(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
Tensor.manual_seed(1772)
a = Tensor.rand(4)
b = Tensor.rand(4)
r = (a+b).sqrt() * ((a+1).exp())
helper_linearizer_opt(r, [
[Opt(OptOps.UPCAST, 0, 4)], # Checking how it works with upcasts
])
def test_matmul(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
self.skipTest("Only Compiled uses linearizer with locals and shared")
N = 128
Tensor.manual_seed(1552)
a = Tensor.rand(N, N)
b = Tensor.rand(N, N)
r = a@b
helper_linearizer_opt(r, [
[Opt(OptOps.UPCAST, 0, 2)],
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)], # Checking how it works with upcasts
[Opt(OptOps.LOCAL, 0, 2)],
[Opt(OptOps.LOCAL, 1, 32)],
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4)],
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 32)],
[Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.LOCAL, 1, 8)], # Checking how it works with locals
[Opt(OptOps.GROUPTOP, 0, 2)],
[Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.GROUPTOP, 0, 32), Opt(OptOps.UNROLL, 0, 4)], # Checking how it works with grouped_reduce
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 4)], # Checking how it works with local+grouped_reduce
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 2)], # Checking all together
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 8)], # Full global upcast + local
])
def test_double_reduce(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
self.skipTest("Only Compiled uses linearizer with locals and shared")
N = 128
Tensor.manual_seed(1552)
a = Tensor.rand(8, N, 8, N)
r = a.sum(axis=(1,3))
helper_linearizer_opt(r, [
# OpenCL / GPU=1 allows at most 256 threads
[Opt(OptOps.GROUPTOP, 0, 2)], [Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.GROUPTOP, 1, 2)], [Opt(OptOps.GROUPTOP, 1, 32)], # Checking how it works with 1 grouped_reduce.
[Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)],
[Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2)],
[Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 64)], # Checking how it works with 2 grouped_reduces.
[Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 0, 4)],
[Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 2, 4)], # Checking how it works with 2 grouped_reduces + upcasts.
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4)],
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 1, 4)], # Checking how it works with 2 grouped_reduces + upcasts + locals.
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2)],
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4)], # Checking how it works with 2 grouped_reduces + upcasts + locals.
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 0, 2)], # No globals
])
def test_tensor_core_opts(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local:
self.skipTest("Only Compiled uses linearizer with locals")
if Device.DEFAULT not in tensor_cores:
self.skipTest("No tensor cores for device")
N = 128
Tensor.manual_seed(1552)
a = Tensor.rand(N, N)
b = Tensor.rand(N, N)
r = a@b
helper_linearizer_opt(r, [
[Opt(OptOps.UPCAST, 0, 4)],
[Opt(OptOps.UPCAST, 1, 4)],
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)], # check upcasts
[Opt(OptOps.UNROLL, 0, 2)], # check last unroll
[Opt(OptOps.LASTLOCAL, 0, 4)], # check last local
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 2)], # check combo of last unroll and last local
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 2)],
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4)],
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.LASTLOCAL, 0, 2)],
# [Opt(OptOps.GROUP, 0, 2)] # doesn't work because group_for_reduce dims become early locals (conflicting with TC)
], apply_tc=True)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,21 @@
import unittest
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.ops import Device
# stuff needed to unpack a kernel
from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
from tinygrad.helpers import dtypes
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.shape.symbolic import Variable
inf, nan = float('inf'), float('nan')
class TestLinearizerFailures(unittest.TestCase):
@unittest.skip("this is currently failing")
def test_failure_1(self):
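# serialized AST, presumably captured from the failing kernel; building the Linearizer from it directly reproduces the failure without constructing a Tensor graph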
ast = LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=BufferOps.MEM, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(32, 16, 16), strides=(16, 1, 0), offset=0, mask=None, contiguous=False),)))),), arg=(32, 16, 1)), LazyOp(op=BufferOps.MEM, src=(), arg=MemBuffer(idx=2, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),))))), arg=None), LazyOp(op=BufferOps.MEM, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(16, 1, 0), offset=0, mask=None, contiguous=True),))))), arg=None)
lin = Linearizer(ast)
prg = Device[Device.DEFAULT].to_program(lin)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python
import time
import cProfile
import pstats
import unittest
import torch
from tinygrad.tensor import Tensor, Device
import pytest
pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
def start_profile():
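# cProfile with a wall-clock timer using nanosecond ticks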
import time
pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)
pr.enable()
return pr
def stop_profile(pr, sort='cumtime', frac=0.2):
pr.disable()
ps = pstats.Stats(pr)
ps.strip_dirs()
ps.sort_stats(sort)
ps.print_stats(frac)
class TestConvSpeed(unittest.TestCase):
def test_mnist(self):
# https://keras.io/examples/vision/mnist_convnet/
conv = 3
inter_chan, out_chan = 32, 64
# ****** torch baseline *******
torch.backends.mkldnn.enabled = False
c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
c2d = torch.nn.functional.conv2d
mp = torch.nn.MaxPool2d((2,2))
lsm = torch.nn.LogSoftmax(dim=1)
cnt = 5
fpt, bpt = 0.0, 0.0
for i in range(cnt):
et0 = time.time()
x = torch.randn(128, 1, 28, 28, requires_grad=True)
x = mp(c2d(x,c1).relu())
x = mp(c2d(x,c2).relu())
x = x.reshape(x.shape[0], -1)
out = lsm(x.matmul(l1))
out = out.mean()
et1 = time.time()
out.backward()
et2 = time.time()
fpt += (et1-et0)
bpt += (et2-et1)
fpt_baseline = (fpt*1000/cnt)
bpt_baseline = (bpt*1000/cnt)
print("torch forward pass: %.3f ms" % fpt_baseline)
print("torch backward pass: %.3f ms" % bpt_baseline)
# ****** tinygrad compare *******
c1 = Tensor(c1.detach().numpy(), requires_grad=True)
c2 = Tensor(c2.detach().numpy(), requires_grad=True)
l1 = Tensor(l1.detach().numpy(), requires_grad=True)
cnt = 5
fpt, bpt = 0.0, 0.0
for i in range(1+cnt):
et0 = time.time()
x = Tensor.randn(128, 1, 28, 28)
x = x.conv2d(c1).relu().avg_pool2d()
x = x.conv2d(c2).relu().max_pool2d()
x = x.reshape(shape=(x.shape[0], -1))
out = x.dot(l1).log_softmax()
out = out.mean()
out.realize()
et1 = time.time()
out.backward()
[x.grad.realize() for x in [c1, c2, l1]]
et2 = time.time()
if i == 0:
pr = start_profile()
else:
fpt += (et1-et0)
bpt += (et2-et1)
stop_profile(pr, sort='time')
fpt = (fpt*1000/cnt)
bpt = (bpt*1000/cnt)
print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,339 @@
#!/usr/bin/env python
import unittest
import numpy as np
from extra.utils import WINDOWS
from tinygrad.helpers import CI
from tinygrad.jit import TinyJit
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import BatchNorm2d, Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, GroupNorm, LayerNorm, LayerNorm2d, Embedding, InstanceNorm
import torch
import pytest
pytestmark = [pytest.mark.exclude_cuda]
class TestNN(unittest.TestCase):
def test_sparse_cat_cross_entropy(self):
input = torch.randn(3, 5)
target = torch.empty(3, dtype=torch.long).random_(5)
loss_fun = torch.nn.CrossEntropyLoss(reduction='mean')
loss = loss_fun(input, target)
input_tiny = Tensor(input.detach().numpy())
target_tiny = Tensor(target.detach().numpy())
loss_tiny = input_tiny.sparse_categorical_crossentropy(target_tiny)
np.testing.assert_allclose(loss_tiny.numpy(), loss.detach().numpy(), atol=1e-5, rtol=1e-6)
def test_batchnorm2d(self, training=False):
szs = [4, 8, 16, 32]
for sz in szs:
# create in tinygrad
Tensor.training = training
bn = BatchNorm2d(sz, eps=1e-5, track_running_stats=training)
bn.weight = Tensor.randn(sz)
bn.bias = Tensor.randn(sz)
bn.running_mean = Tensor.randn(sz)
bn.running_var = Tensor.randn(sz)
bn.running_var.numpy()[bn.running_var.numpy() < 0] = 0
# create in torch
with torch.no_grad():
tbn = torch.nn.BatchNorm2d(sz).eval()
tbn.training = training
tbn.weight[:] = torch.tensor(bn.weight.numpy())
tbn.bias[:] = torch.tensor(bn.bias.numpy())
tbn.running_mean[:] = torch.tensor(bn.running_mean.numpy())
tbn.running_var[:] = torch.tensor(bn.running_var.numpy())
np.testing.assert_allclose(bn.running_mean.numpy(), tbn.running_mean.detach().numpy(), rtol=1e-5, atol=1e-6)
np.testing.assert_allclose(bn.running_var.numpy(), tbn.running_var.detach().numpy(), rtol=1e-5, atol=1e-6)
# trial
inn = Tensor.randn(2, sz, 3, 3)
# in tinygrad
outt = bn(inn)
# in torch
toutt = tbn(torch.tensor(inn.numpy()))
# close
np.testing.assert_allclose(outt.numpy(), toutt.detach().numpy(), rtol=5e-4, atol=1e-6)
np.testing.assert_allclose(bn.running_mean.numpy(), tbn.running_mean.detach().numpy(), rtol=1e-5, atol=1e-6)
np.testing.assert_allclose(bn.running_var.numpy(), tbn.running_var.detach().numpy(), rtol=1e-5, atol=1e-6)
def test_batchnorm2d_training(self):
self.test_batchnorm2d(True)
def test_linear(self):
def _test_linear(x):
# create in tinygrad
model = Linear(in_dim, out_dim)
z = model(x)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.Linear(in_dim, out_dim).eval()
torch_layer.weight[:] = torch.tensor(model.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(model.bias.numpy(), dtype=torch.float32)
torch_x = torch.tensor(x.numpy(), dtype=torch.float32)
torch_z = torch_layer(torch_x)
# test
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
BS, T, in_dim, out_dim = 4, 2, 8, 16
_test_linear(Tensor.randn(BS, in_dim))
_test_linear(Tensor.randn(BS, T, in_dim)) # test with more dims
def test_conv1d(self):
BS, C1, W = 4, 16, 224//4
C2, K, S, P = 64, 7, 2, 1
# create in tinygrad
layer = Conv1d(C1, C2, kernel_size=K, stride=S, padding=P)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.Conv1d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.uniform(BS, C1, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
def test_conv2d(self):
BS, C1, H, W = 4, 16, 224//4, 224//4
C2, K, S, P = 64, 7, 2, 1
# create in tinygrad
layer = Conv2d(C1, C2, kernel_size=K, stride=S, padding=P)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.Conv2d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.uniform(BS, C1, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
@unittest.skipIf(Device.DEFAULT != "TORCH", "Takes too long to compile for Compiled backends")
def test_conv2d_winograd(self):
BS, C1, H, W = 2, 8, 16, 16
C2, K, S, P = 8, 3, 1, 1
old_wino = Tensor.wino
Tensor.wino = True
# create in tinygrad
layer = Conv2d(C1, C2, kernel_size=K, stride=S, padding=P)
layer.weight.requires_grad = True
layer.bias.requires_grad = True
# create in torch
torch_layer = torch.nn.Conv2d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight = torch.nn.Parameter(torch.tensor(layer.weight.numpy(), dtype=torch.float32))
torch_layer.bias = torch.nn.Parameter(torch.tensor(layer.bias.numpy(), dtype=torch.float32))
# test
x = Tensor.uniform(BS, C1, H, W, requires_grad=True)
z = layer(x)
torch_x = torch.tensor(x.numpy(), requires_grad=True)
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
m = z.mean()
m.backward()
gw = layer.weight.grad.realize()
gb = layer.bias.grad.realize()
gx = x.grad.realize()
torch_z.mean().backward()
np.testing.assert_allclose(gw.numpy(), torch_layer.weight.grad.numpy(), atol=5e-4, rtol=1e-5)
np.testing.assert_allclose(gb.numpy(), torch_layer.bias.grad.numpy(), atol=5e-4, rtol=1e-5)
np.testing.assert_allclose(gx.numpy(), torch_x.grad.numpy(), atol=5e-4, rtol=1e-5)
Tensor.wino = old_wino
@unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
def test_conv_transpose1d(self):
BS, C1, W = 4, 16, 224//4
C2, K, S, P = 64, 7, 2, 1
# create in tinygrad
layer = ConvTranspose1d(C1, C2, kernel_size=K, stride=S, padding=P)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.ConvTranspose1d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.uniform(BS, C1, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
@unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
def test_conv_transpose2d(self):
BS, C1, H, W = 4, 16, 224//4, 224//4
C2, K, S, P = 64, 7, 2, 1
# create in tinygrad
layer = ConvTranspose2d(C1, C2, kernel_size=K, stride=S, padding=P)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.ConvTranspose2d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.uniform(BS, C1, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
def test_groupnorm(self):
BS, H, W, C, G = 20, 10, 10, 6, 3
# create in tinygrad
layer = GroupNorm(G, C)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.GroupNorm(G, C).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(BS, C, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_layernorm(self):
N, C, H, W = 20, 5, 10, 10
# create in tinygrad
layer = LayerNorm([H, W])
# create in torch
with torch.no_grad():
torch_layer = torch.nn.LayerNorm([H, W]).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(N, C, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_layernorm_2d(self):
N, C, H, W = 20, 5, 10, 10
# create in tinygrad
layer = LayerNorm2d(C)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.LayerNorm([C]).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(N, C, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x.permute(0,2,3,1)).permute(0,3,1,2)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_instancenorm_2d(self):
N, C, H, W = 20, 5, 10, 10
# create in tinygrad
layer = InstanceNorm(C)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.InstanceNorm2d(C, affine=True).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(N, C, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_instancenorm_3d(self):
N, C, D, H, W = 20, 5, 3, 10, 10
# create in tinygrad
layer = InstanceNorm(C)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.InstanceNorm3d(C, affine=True).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(N, C, D, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_embedding(self):
B, T, C, VS = 4, 10, 20, 28
# create in tinygrad
layer = Embedding(VS, C)
with torch.no_grad():
torch_layer = torch.nn.Embedding(VS, C).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
# test
x = Tensor(np.random.randint(0, VS, (B, T)).astype(np.float32))
z = layer(x)
torch_x = torch.tensor(x.numpy().astype(np.int32))
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
# test with jit enabled
@TinyJit
def layer_jit(x):
return layer(x).realize()
for _ in range(3):
x = Tensor(np.random.randint(0, VS, (B, T)).astype(np.float32))
z = layer_jit(x)
torch_x = torch.tensor(x.numpy().astype(np.int32))
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
if __name__ == '__main__':
unittest.main()

File diff suppressed because it is too large

View File

@@ -0,0 +1,98 @@
import numpy as np
import torch
import unittest
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import Adam, SGD, AdamW
import pytest
pytestmark = pytest.mark.exclude_cuda
np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
W_init = np.random.randn(4,4).astype(np.float32)
m_init = np.random.randn(1,4).astype(np.float32)
class TinyNet:
def __init__(self, tensor):
self.x = tensor(x_init.copy(), requires_grad=True)
self.W = tensor(W_init.copy(), requires_grad=True)
self.m = tensor(m_init.copy())
def forward(self):
out = self.x.matmul(self.W).relu()
# print(out.detach().numpy())
out = out.log_softmax(1)
out = out.mul(self.m).add(self.m).sum()
return out
def step(tensor, optim, steps=1, kwargs={}):
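# builds a fresh TinyNet from the given tensor constructor, runs `steps` optimizer steps, and returns the final x and W for comparison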
net = TinyNet(tensor)
optim = optim([net.x, net.W], **kwargs)
for _ in range(steps):
out = net.forward()
optim.zero_grad()
out.backward()
optim.step()
return net.x.detach().numpy(), net.W.detach().numpy()
class TestOptim(unittest.TestCase):
def _test_optim(self, tinygrad_optim, torch_optim, steps, opts, atol, rtol):
for x,y in zip(step(Tensor, tinygrad_optim, steps, kwargs=opts),
step(torch.tensor, torch_optim, steps, kwargs=opts)):
np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)
def _test_sgd(self, steps, opts, atol, rtol): self._test_optim(SGD, torch.optim.SGD, steps, opts, atol, rtol)
def _test_adam(self, steps, opts, atol, rtol): self._test_optim(Adam, torch.optim.Adam, steps, opts, atol, rtol)
def _test_adamw(self, steps, opts, atol, rtol): self._test_optim(AdamW, torch.optim.AdamW, steps, opts, atol, rtol)
def test_sgd(self): self._test_sgd(1, {'lr': 0.001}, 1e-6, 0)
def test_sgd_high_lr(self): self._test_sgd(1, {'lr': 10}, 1e-6, 1e-5)
def test_sgd_wd(self): self._test_sgd(1, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
def test_sgd_high_lr_wd(self): self._test_sgd(1, {'lr': 10, 'weight_decay': 0.1}, 1e-6, 1e-5)
def test_multistep_sgd(self): self._test_sgd(10, {'lr': 0.001}, 1e-6, 0)
def test_multistep_sgd_high_lr(self): self._test_sgd(10, {'lr': 10}, 1e-6, 3e-4)
def test_multistep_sgd_wd(self): self._test_sgd(10, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
def test_multistep_sgd_high_lr_wd(self): self._test_sgd(10, {'lr': 9, 'weight_decay': 0.1}, 1e-6, 3e-4)
def test_multistep_sgd_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9}, 1e-6, 0)
def test_multistep_sgd_high_lr_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9}, 1e-5, 3e-4)
def test_multistep_sgd_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-6, 0)
def test_multistep_sgd_high_lr_momentum_wd(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-5, 3e-4)
def test_multistep_sgd_nesterov_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True}, 1e-5, 0)
def test_multistep_sgd_high_lr_nesterov_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'nesterov': True}, 1e-5, 3e-4)
def test_multistep_sgd_nesterov_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 0)
def test_multistep_sgd_high_lr_nesterov_momentum_wd(self): self._test_sgd(10, {'lr': 9, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 3e-4)
def test_adam(self): self._test_adam(1, {'lr': 0.001}, 1e-5, 0)
def test_adam_high_lr(self): self._test_adam(1, {'lr': 10}, 1e-4, 1e-4)
def test_adamw(self): self._test_adamw(1, {'lr': 0.001}, 1e-5, 0)
def test_adamw_high_lr(self): self._test_adamw(1, {'lr': 10}, 1e-4, 1e-4)
def test_multistep_adam(self): self._test_adam(10, {'lr': 0.001}, 1e-5, 0)
def test_multistep_adam_high_lr(self): self._test_adam(10, {'lr': 10}, 2e-4, 5e-4)
def test_multistep_adamw(self): self._test_adamw(10, {'lr': 0.001}, 1e-5, 0)
def test_multistep_adamw_high_lr(self): self._test_adamw(10, {'lr': 10}, 5e-4, 2e-3)
def test_duped_weights(self):
for Opt in [Adam, AdamW, SGD]:
losses = []
for i in range(2):
w = Tensor(x_init.copy())
opt = Opt([w], lr=0.1) if i == 0 else Opt([w, w], lr=0.1)
loss = None
for _ in range(3):
loss = w.sum()
opt.zero_grad()
loss.backward()
opt.step()
losses.append(loss.numpy())
np.testing.assert_allclose(losses[0], losses[1], atol=1e-4, rtol=0)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,115 @@
import math
import unittest
import numpy as np
import torch
from tinygrad.tensor import Tensor
import tinygrad.nn as nn
import pytest
from tinygrad.helpers import dtypes
from functools import partial
pytestmark = pytest.mark.webgpu
# https://gist.github.com/devries/11405101
def ksprob(a):
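# Kolmogorov-Smirnov probability: approximates Q_KS(a) = 2 * sum_{j>=1} (-1)^(j-1) * exp(-2 j^2 a^2), truncating once the terms become negligible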
fac, total, termbf = 2.0, 0.0, 0.0
a2 = -2.0 * a * a
for j in range(1, 101):
term = fac * math.exp(a2 * j * j)
total += term
if math.fabs(term) <= 0.001 * termbf or math.fabs(term) <= 1e-8 * total:
return total
fac = -fac
termbf = math.fabs(term)
return 1.0
def kstest(l1, l2):
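# two-sample KS test: finds the max distance between the empirical CDFs of l1 and l2 and returns the p-value for the null hypothesis that both samples come from the same distribution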
n1, n2 = len(l1), len(l2)
l1.sort()
l2.sort()
j1, j2, d, fn1, fn2 = 0, 0, 0.0, 0.0, 0.0
while j1 < n1 and j2 < n2:
d1, d2 = l1[j1], l2[j2]
if d1 <= d2:
fn1 = (float(j1) + 1.0) / float(n1)
j1 += 1
if d2 <= d1:
fn2 = (float(j2) + 1.0) / float(n2)
j2 += 1
dtemp = math.fabs(fn2 - fn1)
if dtemp > d:
d = dtemp
ne = float(n1 * n2) / float(n1 + n2)
nesq = math.sqrt(ne)
prob = ksprob((nesq + 0.12 + 0.11 / nesq) * d)
return prob
def equal_distribution(tiny_func, torch_func=None, numpy_func=None, shape=(20, 23), alpha=0.05):
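# samples from the tinygrad initializer and compares against torch and/or numpy references; returns True when each KS p-value is >= alpha, i.e. we cannot reject that the distributions match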
Tensor.manual_seed(1337)
torch.manual_seed(1337)
np.random.seed(1337)
assert not (torch_func is None and numpy_func is None), "no function to compare with"
x = tiny_func(*shape).numpy().flatten()
if numpy_func is not None: y = numpy_func(shape).flatten()
if torch_func is not None: z = torch_func(shape).numpy().flatten()
return (numpy_func is None or kstest(x, y) >= alpha) and (torch_func is None or kstest(x, z) >= alpha)
def normal_test(func, shape=(20, 23), alpha=0.05): return equal_distribution(func, numpy_func=lambda x: np.random.randn(*x), shape=shape, alpha=alpha)
class TestRandomness(unittest.TestCase):
def test_rand(self):
self.assertFalse(normal_test(Tensor.rand))
self.assertTrue(equal_distribution(Tensor.rand, torch.rand, lambda x: np.random.rand(*x)))
def test_randn(self):
self.assertTrue(normal_test(Tensor.randn))
self.assertTrue(equal_distribution(Tensor.randn, torch.randn, lambda x: np.random.randn(*x)))
def test_normal(self):
self.assertTrue(normal_test(Tensor.normal))
self.assertTrue(equal_distribution(Tensor.normal, lambda x: torch.nn.init.normal_(torch.empty(x), mean=0, std=1), lambda x: np.random.normal(loc=0, scale=1, size=x)))
def test_uniform(self):
self.assertFalse(normal_test(Tensor.uniform))
self.assertTrue(equal_distribution(Tensor.uniform, lambda x: torch.nn.init.uniform_(torch.empty(x)), lambda x: np.random.uniform(size=x)))
self.assertTrue(equal_distribution(partial(Tensor.uniform, low=-100, high=100, dtype=dtypes.int32), numpy_func=lambda x: np.random.randint(low=-100, high=100, size=x)))
def test_scaled_uniform(self):
self.assertFalse(normal_test(Tensor.scaled_uniform))
self.assertTrue(equal_distribution(Tensor.scaled_uniform, lambda x: torch.nn.init.uniform_(torch.empty(x), a=-1, b=1) / math.sqrt(math.prod(x)), lambda x: np.random.uniform(-1, 1, size=x) / math.sqrt(math.prod(x))))
def test_glorot_uniform(self):
self.assertFalse(normal_test(Tensor.glorot_uniform))
self.assertTrue(equal_distribution(Tensor.glorot_uniform, lambda x: torch.nn.init.xavier_uniform_(torch.empty(x)), lambda x: np.random.uniform(-1, 1, size=x) * math.sqrt(6 / (x[0] + math.prod(x[1:])))))
def test_kaiming_uniform(self):
Tensor.manual_seed(1337)
torch.manual_seed(1337)
np.random.seed(1337)
for shape in [(128, 64, 3, 3), (20, 24)]:
self.assertTrue(equal_distribution(Tensor.kaiming_uniform, lambda x: torch.nn.init.kaiming_uniform_(torch.empty(x)), shape=shape))
def test_kaiming_normal(self):
Tensor.manual_seed(1337)
torch.manual_seed(1337)
np.random.seed(1337)
for shape in [(128, 64, 3, 3), (20, 24)]:
self.assertTrue(equal_distribution(Tensor.kaiming_normal, lambda x: torch.nn.init.kaiming_normal_(torch.empty(x)), shape=shape))
def test_conv2d_init(self):
params = (128, 256, (3,3))
assert equal_distribution(lambda *_: nn.Conv2d(*params).weight, lambda _: torch.nn.Conv2d(*params).weight.detach())
assert equal_distribution(lambda *_: nn.Conv2d(*params).bias, lambda _: torch.nn.Conv2d(*params).bias.detach())
def test_linear_init(self):
params = (64, 64)
assert equal_distribution(lambda *_: nn.Linear(*params).weight, lambda _: torch.nn.Linear(*params).weight.detach())
assert equal_distribution(lambda *_: nn.Linear(*params).bias, lambda _: torch.nn.Linear(*params).bias.detach())
def test_bn_init(self):
params = (64,)
assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).weight, lambda _: torch.nn.BatchNorm2d(*params).weight.detach())
assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).bias, lambda _: torch.nn.BatchNorm2d(*params).bias.detach())
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,335 @@
# this will be the new test_ops for the next level
# schedule confirms the right things are capable of fusing
# NOTE: this has overlap with external_test_opt.py
import unittest
from typing import List, Optional
from tinygrad.tensor import Tensor
from tinygrad.ops import LoadOps, Device, Compiled
from tinygrad.helpers import DEBUG, dtypes
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.graph import log_schedule_item, print_tree
from tinygrad import nn
def check_schedule(t:Tensor, allowed:int, to_prerealize:Optional[List[Tensor]]=None, filter_loadops=True):
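# realizes the schedule for t (optionally pre-realizing some tensors first), asserts exactly `allowed` non-LoadOps kernels remain, then checks each one linearizes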
seen = set()
if to_prerealize:
for pre in to_prerealize:
for s in pre.lazydata.schedule(seen.copy()):
log_schedule_item(s)
seen.add(s.out)
sched = t.lazydata.schedule(seen)
for s in sched: log_schedule_item(s)
if filter_loadops: sched = [s for s in sched if s.ast.op not in LoadOps]
if len(sched) != allowed: print(f"SCHEDULE ISSUE, expecting {allowed} got {len(sched)}")
if len(sched) != allowed or DEBUG >= 3:
for i, s in enumerate(sched):
print("op", i)
print_tree(s.ast)
assert len(sched) == allowed
# test that the (non-LoadOps) ops linearize
for s in sched:
if s.ast.op in LoadOps: continue
l = Linearizer(s.ast)
l.hand_coded_optimizations()
l.linearize()
class TestSchedule(unittest.TestCase):
def test_basic_binop_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(10)
d = a+b+c
check_schedule(d, 1)
def test_basic_binop_fusion_deep(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(10)
d = Tensor.empty(10)
e = a+b+c+d
check_schedule(e, 1)
def test_mulacc_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = (a*b).sum()
check_schedule(c, 1)
def test_mulacc_relu_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = (a*b).sum().relu()
check_schedule(c, 1)
def test_binop_reshape_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(5,2)
d = (a+b).reshape(5,2)+c
check_schedule(d, 1)
def test_binop_permute_fusion(self):
a = Tensor.empty(2,5)
b = Tensor.empty(2,5)
c = Tensor.empty(5,2)
d = (a+b).permute(1,0)+c
check_schedule(d, 1)
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled) or Device.DEFAULT == "LLVM", "only test for compiled backends")
def test_constants_are_embedded(self):
a = Tensor.empty(3,3) * 2
check_schedule(a, 2, filter_loadops=False)
def test_binop_elu_fusion(self):
a = Tensor.empty(10)
b = a.elu()
check_schedule(b, 1)
def test_binop_reshape_reduce_fusion(self):
a = Tensor.empty(100)
b = Tensor.empty(100)
c = (a+b).reshape(10, 10).sum(axis=0, keepdim=True)
check_schedule(c, 1)
def test_reduce_reshape_binop_fusion(self):
a = Tensor.empty(10,10)
b = Tensor.empty(10)
c = a.sum(axis=0) + b
check_schedule(c, 1)
@unittest.skip("not pushing permutes through reduces")
def test_reduce_permute_binop_fusion(self):
a = Tensor.empty(10,10,10)
b = Tensor.empty(10,10,1)
c = a.sum(axis=0, keepdim=True).permute(2,1,0) + b
check_schedule(c, 1)
def test_binop_early_reshape_reduce_fusion(self):
a = Tensor.empty(100)
b = Tensor.empty(100)
c = Tensor.empty(10,10)
d = ((a+b).reshape(10,10) + c).sum(axis=0)
check_schedule(d, 1)
def test_diamond_folded(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(10)
d = Tensor.empty(10)
ab = a+b
e = (ab+c) + (ab+d)
check_schedule(e, 1)
def test_cache_binaryop(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = a+b
d = a+b
check_schedule(d, 0, [c])
@unittest.skip("failing in old lazy")
def test_cache_binaryop_reshaped(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = a+b
d = a.reshape(10,1)+b.reshape(10,1)
check_schedule(d, 0, [c])
def test_cache_binaryop_transpose(self):
a = Tensor.empty(10,10)
b = Tensor.empty(10,10)
c = (a.T*b.T).T #.contiguous()
d = a*b
check_schedule(d, 0, [c])
def test_cache_two_reduceops(self):
a = Tensor.empty(10)
b = a.sum()
c = a.sum()
bc = b+c
check_schedule(bc, 1)
def test_fold_double_unary(self):
y = Tensor.empty(2)
out = y.sum(keepdim=True).sqrt().__neg__()
check_schedule(out, 1)
#@unittest.skip("may want to reconsider this")
def test_fold_batchnorm(self):
with Tensor.train():
img = Tensor.empty(1,32,4,4)
bn = nn.BatchNorm2d(32, track_running_stats=False)
out = bn(img)
check_schedule(out, 3)
def test_fold_conv_relu(self):
c1 = nn.Conv2d(3,16,3)
# run
img = Tensor.ones(2,3,64,64)
out = c1(img).relu()
check_schedule(out, 1, [c1.weight, c1.bias])
def test_fold_conv_elu(self):
c1 = nn.Conv2d(3,16,3)
# run
img = Tensor.rand(2,3,64,64)
out = c1(img).elu()
check_schedule(out, 1, [c1.weight, c1.bias])
def test_two_sum(self):
img = Tensor.empty(64,64)
x = (img.sum(0) + img.sum(1))
out = x.relu()
del x # without this, the schedule is 3 kernels
check_schedule(out, 2)
@unittest.skip("failing in old lazy")
def test_push_permute_through_reshape(self):
a = Tensor.empty(16,16)
b = Tensor.empty(16,16)
c = (a+b).reshape(4,4,4,4).permute(2,3,0,1).contiguous()
check_schedule(c, 1)
@unittest.skip("failing in old lazy")
def test_push_permute_through_reshape_alt(self):
a = Tensor.empty(4,4,4,4)
b = Tensor.empty(4,4,4,4)
c = (a+b).reshape(16,16).permute(1,0).contiguous()
check_schedule(c, 1)
def test_no_binop_rerun(self):
a = Tensor.empty(16)
b = Tensor.empty(16)
c = a+b
d = (a+b).reshape(16,1)
check_schedule(d, 0, [c])
def test_multi_permute_should_collapse(self):
a = Tensor.empty(4,4,4,4)
b = Tensor.empty(16)
c = a.sum((0,1)).cast(dtypes.float16).permute(1,0).reshape(4,4,1).permute(1,0,2).reshape(16) + b
check_schedule(c, 1)
@unittest.skip("failing in old lazy")
def test_fancy_reshape_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = a+b
d = a.reshape(10,1)+b.reshape(10,1)
out = c.sum() + d.sum()
check_schedule(out, 1)
# NOTE: for this to pass, LazyViews must be children of LazyBuffers so the (a+b) runs first
@unittest.skip("not real world")
def test_children_dont_push(self):
a = Tensor.empty(10, 10, 1)
b = Tensor.empty(10, 10, 1)
d = (a+b).expand(10, 10, 10)
e = (a+b).permute(2,1,0)
f = d+e
check_schedule(f, 2)
def test_dont_fuse_binops_with_children(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(10)
keep_me = a+b
e = keep_me.sum() # give keep_me a child (NOTE: BinaryOps won't be a child since it will instant fuse)
d = keep_me+c
check_schedule(d, 2)
check_schedule(keep_me, 0, [d])
@unittest.skip("failing in old lazy")
def test_permute_breaks_fusion(self):
a = Tensor.empty(10, 10, 10)
b = Tensor.empty(10, 10)
c = (a.sum(axis=2) + b).permute(1,0)
d = c.permute(1,0)
check_schedule(d, 1)
def test_some_permute_fusion(self):
a = Tensor.empty(8192, 16)
b = Tensor.empty(1, 16)
d = (a.T + b.expand(8192, 16).T)
c = a + b.expand(8192, 16)
e = d.T
check_schedule(c, 1)
check_schedule(e, 1)
# this is the failing case in openpilot... it's very simple, like this
@unittest.skip("failing in old lazy")
def test_image_conv_fusion(self):
from tinygrad.features.image import image_conv2d
w1 = Tensor.empty(16, 16, 1, 1)
b1 = Tensor.empty(16)
w2 = Tensor.empty(16, 16, 1, 1)
b2 = Tensor.empty(16)
w3 = Tensor.empty(16, 16, 1, 1)
b3 = Tensor.empty(16)
x = Tensor.empty(1, 16, 32, 32)
x = base = image_conv2d(x, w1, b1)
x = image_conv2d(x, w2, b2) + base
x = image_conv2d(x, w3, b3)
# NOOP, 3 convs, contiguous
check_schedule(x, 5)
def test_image_conv_fusion_minimal(self):
b1 = Tensor.empty(16)
b2 = Tensor.empty(16)
def p(x): return x.permute(1,0).contiguous().reshape(32,16,1).expand(32,16,16).sum(axis=2).permute(1,0)
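# p roughly mimics the data movement of a 1x1 image conv: permute/reshape/expand, a reduce over the channel axis, then a permute back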
x = Tensor.empty(16, 32)
x = base = p(x) + b1.reshape(16,1)
x = p(x)
x = x + b2.reshape(16,1)
x = x + base
del base
x = p(x)
check_schedule(x, 4)
def test_image_conv_fusion_more_minimal(self):
b1 = Tensor.empty(16)
def p(x): return x.permute(1,0).contiguous().reshape(32,16,1).expand(32,16,16).sum(axis=2).permute(1,0)
x = Tensor.empty(16, 32)
x = base = p(x) + b1.reshape(16,1)
x = p(x)
del base
check_schedule(x, 3)
def test_resnet_block(self):
from models.resnet import BasicBlock
Tensor.training = False
bb = BasicBlock(64,64)
x = Tensor.empty(1, 64, 32, 32)
out = bb(x)
check_schedule(out, 4)
def test_contiguous_while_contiguous(self):
x = Tensor.empty(1, 64, 32, 32)
out = x.contiguous()
check_schedule(out, 1, filter_loadops=False)
def test_contiguous_while_not_contiguous(self):
x = Tensor.empty(1, 64, 32, 32)
out = x.permute(0,2,3,1).contiguous()
check_schedule(out, 2, filter_loadops=False)
def test_double_from(self):
x = Tensor([1,2,3,4])
out = x.to('cpu')
check_schedule(out, 0, filter_loadops=False)
def test_pow_const_tensor(self):
x = Tensor([1,2,3,4])
out = x ** Tensor(2)
check_schedule(out, 1)
if __name__ == '__main__':
unittest.main(verbosity=2)

View File

@@ -0,0 +1,19 @@
import unittest
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.features.search import time_linearizer
from tinygrad.ops import Compiled, Device, LoadOps
from tinygrad.tensor import Tensor
class TestTimeLinearizer(unittest.TestCase):
def setUp(self) -> None:
if not isinstance(Device[Device.DEFAULT], Compiled): raise unittest.SkipTest("only test for compiled backends")
def test_reasonable_time(self):
si = [si for si in Tensor([1,2,3,4]).add(1).lazydata.schedule() if si.ast.op not in LoadOps][0]
rawbufs = [Device[Device.DEFAULT].buffer(si.out.st.size(), si.out.dtype)] + [Device[Device.DEFAULT].buffer(x.st.size(), x.dtype) for x in si.inputs]
tm = time_linearizer(Linearizer(si.ast), rawbufs, allow_test_size=False, cnt=10)
assert tm > 0 and tm != float('inf')
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,57 @@
import unittest
from tinygrad.tensor import Tensor
from tinygrad.helpers import dtypes
from tinygrad.ops import Device
import pytest
# similar to test/external/external_test_gpu_ast.py, but universal
pytestmark = pytest.mark.exclude_cuda
class TestSpecific(unittest.TestCase):
# from openpilot
# 1x1 6 <- 24
def test_1x1_6_24(self):
x = Tensor.randn(1, 24*4, 32, 64)
w = Tensor.randn(6*4, 24*4, 1, 1)
x.conv2d(w).permute(0,2,3,1).reshape(32, 384, 4).contiguous().realize()
def test_vec_mul(self):
# this forces it to be an image...
x = Tensor.ones(1, 512, 4).contiguous().reshape(1, 2048)
w = Tensor.randn(2048, 512)
(x @ w).reshape(1, 128, 4).contiguous().realize()
@unittest.skipIf(Device.DEFAULT in ["LLVM", "WEBGPU"], "Broken on LLVM and webgpu")
def test_big_vec_mul(self):
# from LLaMA
# 0 buffer<4096, dtypes.float> [View((1024, 1, 1, 4), (4, 0, 0, 1), 0, None)]
# 1 buffer<4096, dtypes.float> [View((1024, 1024, 4, 4), (0, 4, 1, 0), 0, None)]
# 2 buffer<16777216, dtypes.half> [View((1024, 1024, 4, 4), (16384, 4, 1, 4096), 0, None)]
x = Tensor.randn(4096).realize()
w = Tensor.randn(4096, 4096, device='cpu').cast(dtypes.float16).to(Device.DEFAULT).realize()
(x @ w.T).realize()
# from https://dl.acm.org/doi/pdf/10.1145/3495243.3517020
# ~260 GFLOPS on Adreno 640, should be 260*(720/890)*(596/710) = 176.5 on downclocked 630
# we get 170
def test_1x1_28_28(self):
x = Tensor.randn(1, 256, 28, 28)
w = Tensor.randn(256, 256, 1, 1)
x.conv2d(w).permute(0,2,3,1).reshape(28, 28*256//4, 4).contiguous().realize()
# 132 GFLOPS on Adreno 640, should be 132*(720/890)*(596/710) = 90 on downclocked 630
# gets 54 with broken opt, 74 without opt, and 146 if we pad and opt 3!
def test_3x3_28_28_stride_2(self):
x = Tensor.randn(1, 288, 36, 36)
w = Tensor.randn(384, 288, 3, 3)
x.conv2d(w, stride=2).permute(0,2,3,1).reshape(17, 17*384//4, 4).contiguous().realize()
def test_3x3_28_28_stride_2_padded(self):
x = Tensor.randn(1, 288, 36, 36)
w = Tensor.randn(384, 288, 3, 3)
x.conv2d(w, stride=2, padding=1).permute(0,2,3,1).reshape(18, 18*384//4, 4).contiguous().realize()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,288 @@
import os
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
import unittest
import torch
torch.set_num_threads(1)
import time
import numpy as np
np.set_printoptions(linewidth=160)
from tinygrad.ops import Device
from tinygrad.helpers import GlobalCounters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
from tinygrad.helpers import colored, getenv, CI
from tinygrad.jit import TinyJit
import pytest
pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
torch_dt = torch.float16 if getenv("HALF", 0) else torch.float32
torch_device = torch.device('mps' if getenv("MPS", 0) else ('cuda' if getenv("TORCHCUDA", 0) else 'cpu'))
if str(torch_device) == "mps":
import torch.mps
sync = lambda: torch.mps.synchronize()
elif str(torch_device) == "cuda":
import torch.cuda
sync = lambda: torch.cuda.synchronize()
else:
sync = lambda: None
def colorize_float(x):
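# colors the et_tinygrad/et_torch ratio: green when tinygrad is clearly faster (<0.75x), red when clearly slower (>1.15x), yellow otherwise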
ret = f"{x:7.2f}x"
if x < 0.75:
return colored(ret, 'green')
elif x > 1.15:
return colored(ret, 'red')
else:
return colored(ret, 'yellow')
save_ops, save_mem = 0, 0
CNT = getenv("CNT", 8)
def helper_test_speed(f1, *args):
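# times f1 over CNT runs, mutating the args and touching a scratch buffer between runs to defeat op and memory caches; returns the result and the best (minimum) time in ms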
global save_ops, save_mem
ets = []
ret = None
cache_defeat = np.zeros((2048,2048))
for i in range(CNT):
del ret
# defeat the operation cache
args = [(x+1).realize() if isinstance(x, Tensor) else (None if x is None else (x+1)) for x in args]
# force syncing
[x.numpy() if isinstance(x, Tensor) or str(torch_device) == "cpu" else x.cpu().numpy() for x in args if x is not None]
# clear 32MB global memory cache (CPU and global memory only)
cache_defeat += 1
# manual pre sync
if isinstance(args[0], Tensor): Device[args[0].device].synchronize()
else: sync()
GlobalCounters.global_ops = 0
GlobalCounters.global_mem = 0
st = time.perf_counter()
ret = f1(*args)
if isinstance(ret, Tensor): Device[ret.device].synchronize()
else: sync()
et = (time.perf_counter() - st) * 1000
if i >= 1: ets.append(et)
if GlobalCounters.global_ops:
save_ops, save_mem = GlobalCounters.global_ops, GlobalCounters.global_mem
return ret.numpy() if isinstance(ret, Tensor) else ret.cpu().numpy(), np.min(ets)
def helper_test_generic_square(name, N, f1, f2, onearg=False):
torch.manual_seed(0)
torch_a = (torch.rand(N, N, dtype=torch_dt) - 0.5).to(torch_device)
torch_b = (torch.rand(N, N, dtype=torch_dt) - 0.5).to(torch_device) if not onearg else None
tiny_a = Tensor(torch_a.cpu().numpy())
tiny_b = Tensor(torch_b.cpu().numpy()) if not onearg else None
helper_test_generic(f"{name:30s} {N:5d}x{N:5d}", f1, (torch_a, torch_b), TinyJit(lambda a,b:f2(a,b).realize()), (tiny_a, tiny_b))
def helper_test_matvec(name, N, M):
torch.manual_seed(0)
torch_a = (torch.rand(N, dtype=torch_dt) - 0.5).to(torch_device)
torch_b = (torch.rand(N, M, dtype=torch_dt) - 0.5).to(torch_device)
tiny_a = Tensor(torch_a.cpu().numpy())
tiny_b = Tensor(torch_b.cpu().numpy())
helper_test_generic(f"{name:30s} {N:5d}x{M:5d}", lambda a,b: a@b, (torch_a, torch_b), TinyJit(lambda a,b:(a@b).realize()), (tiny_a, tiny_b))
prefix = None
def helper_test_generic(name, f1, f1_args, f2, f2_args):
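# runs the torch fn and the tinygrad fn through helper_test_speed, prints a side-by-side GFLOPS/GB/s comparison, and asserts the outputs match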
global prefix
with torch.no_grad():
val_torch, et_torch = helper_test_speed(f1, *f1_args)
val_tinygrad, et_tinygrad = helper_test_speed(f2, *f2_args)
desc = "faster" if et_torch > et_tinygrad else "slower"
flops = save_ops*1e-6
mem = save_mem*1e-6
print(("\r" if not CI else "")+f"{name:42s} {et_torch:7.2f} ms ({flops/et_torch:8.2f} GFLOPS {mem/et_torch:8.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:8.2f} GFLOPS {mem/et_tinygrad:8.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")
np.testing.assert_allclose(val_tinygrad, val_torch, atol=1e-3, rtol=1e-3)
def helper_test_conv(bs, in_chans, out_chans, kernel_size, img_size_y, img_size_x):
torch.manual_seed(0)
torch_dat = torch.rand(bs, in_chans, img_size_y, img_size_x, dtype=torch_dt).to(torch_device)
torch_conv = torch.nn.Conv2d(in_chans, out_chans, kernel_size, bias=None, dtype=torch_dt).to(torch_device)
tiny_dat = Tensor(torch_dat.cpu().numpy())
tiny_conv = Conv2d(in_chans, out_chans, kernel_size, bias=None)
tiny_conv.weight = Tensor(torch_conv.weight.detach().cpu().numpy())
def f1(torch_dat): return torch_conv(torch_dat)
def f2(tiny_dat): return tiny_conv(tiny_dat).realize()
helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:{kernel_size}", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))
@unittest.skipIf(getenv("BIG") == 0, "no big tests")
class TestBigSpeed(unittest.TestCase):
def test_add(self):
def f(a, b): return a+b
helper_test_generic_square('add', 8192, f, f)
def test_exp(self):
def f(a, b): return a.exp()
helper_test_generic_square('exp', 8192, f, f, onearg=True)
def test_gemm_2048(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 2048, f, f)
def test_gemm_4096(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 4096, f, f)
def test_large_conv_1x1(self): helper_test_conv(bs=32, in_chans=128, out_chans=128, kernel_size=1, img_size_y=128, img_size_x=128)
def test_large_conv_3x3(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=3, img_size_y=130, img_size_x=130)
def test_large_conv_5x5(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=5, img_size_y=132, img_size_x=132)
def test_matvec_4096_16384(self): helper_test_matvec('matvec_4096_16384', 4096, 16384)
def test_matvec_16384_4096(self): helper_test_matvec('matvec_16384_4096', 16384, 4096)
@unittest.skipIf(getenv("BIG") == 1, "only big tests")
class TestSpeed(unittest.TestCase):
def test_sub(self):
def f(a, b): return a-b
helper_test_generic_square('sub', 4096, f, f)
@unittest.skipIf(CI and Device.DEFAULT == "WEBGPU", "breaking on webgpu CI")
def test_pow(self):
def f(a, b): return a.pow(b)
helper_test_generic_square('pow', 2048, f, f)
def test_sum(self):
def f(a, b): return a.sum()
helper_test_generic_square('sum', 2048, f, f, onearg=True)
helper_test_generic_square('sum', 4096, f, f, onearg=True)
def test_partial_sum(self):
R = 256
def f(a, b): return a.reshape(int(4096//R), int(4096*R)).sum(axis=1)
helper_test_generic_square('partial_sum', 4096, f, f, onearg=True)
@unittest.skip("not really used in models")
def test_cumsum(self):
def f0(a, b): return a.cumsum(axis=0)
def f1(a, b): return a.cumsum(axis=1)
helper_test_generic_square('cumsum_0', 256, f0, f0, onearg=True)
helper_test_generic_square('cumsum_1', 256, f1, f1, onearg=True)
def test_cat(self):
helper_test_generic_square('cat_0', 256, lambda x,y: torch.cat((x,y),dim=0), lambda x,y: x.cat(y,dim=0))
helper_test_generic_square('cat_1', 256, lambda x,y: torch.cat((x,y),dim=1), lambda x,y: x.cat(y,dim=1))
def test_array_packing(self):
N = 2048
def f(a, b): return a.reshape(N, N // 32, 32).permute(1,0,2).contiguous()
helper_test_generic_square('array_packing', N, f, f, onearg=True)
def test_permute(self):
for N in [1024, 4096]:
# this is a 64MB tensor, M1 L1 cache is 128kB
# to fit easily in L1, rotations should be 128x128 chunks. 128x128 is also the AMX size
def f(a, b): return a.permute(1,0).contiguous()
helper_test_generic_square('permute', N, f, f, onearg=True)
def test_double_permute(self):
N = 64
torch.manual_seed(0)
torch_a = (torch.rand(N, N, N, N, dtype=torch_dt) - 0.5).to(torch_device)
tiny_a = Tensor(torch_a.cpu().numpy())
def f(a): return a.permute(1,0,3,2).contiguous()
helper_test_generic(f"double_permute {tiny_a.shape}", f, (torch_a,), TinyJit(lambda a: f(a).realize()), (tiny_a,))
def test_neg(self):
def f(a, b): return -a
helper_test_generic_square('neg', 4096, f, f, onearg=True)
def test_exp(self):
def f(a, b): return a.exp()
helper_test_generic_square('exp', 2048, f, f, onearg=True)
def test_relu(self):
def f(a, b): return a.relu()
helper_test_generic_square('relu', 4096, f, f, onearg=True)
def test_max(self):
def f(a, b): return a.max()
helper_test_generic_square('max', 4096, f, f, onearg=True)
def test_mul_sum(self):
def f(a, b): return (a*b).sum()
helper_test_generic_square('mul_sum', 4096, f, f)
def test_add(self):
for N in [1, 1024, 4096]:
def f(a, b): return a + b
helper_test_generic_square('add', N, f, f)
def test_add_constant(self):
def f(a, b): return a+2.0
helper_test_generic_square('add_constant', 4096, f, f, onearg=True)
def test_add_sq(self):
def f(a, b): return a*a + b*b
helper_test_generic_square('add_sq', 4096, f, f)
def test_gemm(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 1024, f, f)
def test_gemm_small(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 256, f, f)
def test_gemm_unrolled(self):
N = 512
def f1(a, b): return a@b.T
def f2(a, b): return (a.reshape(N, 1, N).expand(N, N, N) * b.reshape(1, N, N).expand(N, N, N)).sum(axis=2)
helper_test_generic_square('gemm_unrolled', N, f1, f2)
def test_gemm_unrolled_permute_l(self):
N = 512
def f1(a, b): return a.T@b.T
def f2(a, b): return (a.permute(1,0).reshape(N, 1, N).expand(N, N, N) * b.reshape(1, N, N).expand(N, N, N)).sum(axis=2)
helper_test_generic_square('gemm_unrolled_permute_l', N, f1, f2)
def test_gemm_unrolled_permute_r(self):
N = 512
def f1(a, b): return a@b
def f2(a, b): return (a.reshape(N, 1, N).expand(N, N, N) * b.permute(1,0).reshape(1, N, N).expand(N, N, N)).sum(axis=2)
helper_test_generic_square('gemm_unrolled_permute_r', N, f1, f2)
def test_gemm_unrolled_permute_lr(self):
N = 512
def f1(a, b): return a.T@b
def f2(a, b): return (a.permute(1,0).reshape(N, 1, N).expand(N, N, N) * b.permute(1,0).reshape(1, N, N).expand(N, N, N)).sum(axis=2)
helper_test_generic_square('gemm_unrolled_permute_lr', N, f1, f2)
def test_matvec_1024_1024(self): helper_test_matvec('matvec_1024_1024', 1024, 1024)
def test_matvec_1024_4096(self): helper_test_matvec('matvec_1024_4096', 1024, 4096)
def test_matvec_4096_1024(self): helper_test_matvec('matvec_4096_1024', 4096, 1024)
def test_matvec_4096_4096(self): helper_test_matvec('matvec_4096_4096', 4096, 4096)
def test_openpilot_conv2d(self):
bs, in_chans, out_chans = 1,12,32
torch.manual_seed(0)
torch_dat = torch.rand(bs, 64, 128, 12, dtype=torch_dt).to(torch_device)
torch_conv = torch.nn.Conv2d(in_chans, out_chans, 3, bias=None, padding=1, dtype=torch_dt).to(torch_device)
tiny_dat = Tensor(torch_dat.cpu().numpy())
tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
tiny_conv.weight = Tensor(torch_conv.weight.detach().cpu().numpy())
def f1(torch_dat): return torch_conv(torch_dat.permute(0,3,1,2))
def f2(tiny_dat): return tiny_conv(tiny_dat.permute(0,3,1,2)).realize()
helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:3", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))
def test_conv2d(self):
for bs in [32]:
for in_chans in IN_CHANS:
for out_chans in [32]:
helper_test_conv(bs, in_chans, out_chans, 3, 34, 34)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,181 @@
import unittest
from tinygrad.jit import TinyJit
from tinygrad.helpers import getenv
from tinygrad.shape.symbolic import Variable
from tinygrad.tensor import Tensor, Device
import numpy as np
@unittest.skipIf(getenv("ARM64") or getenv("PTX"), "ARM64 and PTX are not supported")
@unittest.skipUnless(Device.DEFAULT in ["GPU", "METAL", "CLANG", "CUDA", "LLVM"], f"{Device.DEFAULT} is not supported")
class TestSymbolicJit(unittest.TestCase):
def test_plus1(self):
def f(a): return (a+1).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
symbolic = jf(a.reshape(3, vi)).reshape(3, i).numpy()
expected = f(a).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
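    # the pattern used throughout this class: Variable("i", 1, 10) declares a
    # symbolic dim with known bounds, .bind(i) attaches this call's concrete
    # value, and reshape swaps the concrete dim for the symbol. A rough sketch
    # of one iteration:
    #   vi = Variable("i", 1, 10).bind(4)
    #   t = Tensor.rand(3, 4).reshape(3, vi)   # shape is now (3, i)
    # every bound value then reuses one cached kernel, which is what the
    # len(jf.jit_cache) == 1 asserts verify.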
def test_reshape_inside_plus1(self):
def f(a, jit=False, jit_ctx=None):
if jit: a = a.reshape(3, Variable("i", 1, 10).bind(a.shape[1]))
return (a+1).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10)
a = Tensor.rand(3, i)
symbolic = jf(a, jit=True, jit_ctx={vi: i}).reshape(3, i).numpy()
expected = f(a).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_add(self):
def f(a, b): return (a+b).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(3, i)
symbolic = jf(a.reshape(3, vi), b.reshape(3, vi)).reshape(3, i).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_matmul(self):
def f(a, b): return (a@b).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(i, 5)
symbolic = jf(a.reshape(3, vi), b.reshape(vi, 5)).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_mixed_with_no_symbol_kernel(self):
def f(a, b):
s = (a@b).realize()
s = (s+s).realize() # this one does not have symbols in input
return s
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(i, 5)
symbolic = jf(a.reshape(3, vi), b.reshape(vi, 5)).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 2
def test_attention(self):
def f(q, k, v): return Tensor.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
q = Tensor.rand(2, 1, 4, 8)
k = Tensor.rand(2, i, 4, 8)
v = Tensor.rand(2, i, 4, 8)
symbolic = jf(q, k.reshape(2, vi, 4, 8), v.reshape(2, vi, 4, 8)).reshape(2, 4, 1, 8).numpy()
expected = f(q, k, v).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 6
def test_cat_dim0(self):
def f(a, b): return a.cat(b, dim=0).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(i, 3)
b = Tensor.rand(2, 3)
symbolic = jf(a.reshape(vi, 3), b).reshape(i+2, 3).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_cat_dim1(self):
def f(a, b): return a.cat(b, dim=1).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(3, 2)
symbolic = jf(a.reshape(3, vi), b).reshape(3, i+2).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_cat_dim0_two_vars(self):
def f(a, b): return a.cat(b, dim=0).realize()
jf = TinyJit(f)
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(i, 3)
b = Tensor.rand(j, 3)
symbolic = jf(a.reshape(vi, 3), b.reshape(vj, 3)).reshape(i+j, 3).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_cat_dim1_two_vars(self):
def f(a, b): return a.cat(b, dim=1).realize()
jf = TinyJit(f)
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(3, i)
b = Tensor.rand(3, j)
symbolic = jf(a.reshape(3, vi), b.reshape(3, vj)).reshape(3, i+j).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_two_vars_plus1(self):
def f(a, b): return (a@b+1).realize()
jf = TinyJit(f)
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(i, 3)
b = Tensor.rand(3, j)
symbolic = jf(a.reshape(vi, 3), b.reshape(3, vj)).reshape(i, j).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_jit_symbolic_shape_mismatch(self):
@TinyJit
def add(a, b): return (a+b).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i).reshape(3, vi)
b = Tensor.rand(3, i).reshape(3, vi)
c = add(a, b)
vi2 = Variable("i", 1, 10).bind(7)
a = Tensor.rand(3, 7).reshape(3, vi2)
bad = Tensor.rand(4, 7).reshape(4, vi2)
with self.assertRaises(AssertionError):
add(a, bad)
def test_shrink(self):
    # shrink is a movement op, so we pair it with a simple elementwise function to check the JIT interaction
def f(a): return (a+1).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(7, 11)
symbolic = a.shrink(((3,5),(vi,vi+2)))
symbolic = jf(symbolic).numpy()
expected = f(a.shrink(((3,5),(i,i+2)))).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,124 @@
import unittest
from tinygrad.jit import JIT_SUPPORTED_DEVICE
from tinygrad.shape.symbolic import Variable
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor, Device
import numpy as np
@unittest.skipIf(getenv("ARM64") or getenv("PTX"), "ARM64 and PTX are not supported")
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["HIP", "WEBGPU"], f"{Device.DEFAULT} is not supported")
class TestSymbolicOps(unittest.TestCase):
def test_plus1(self):
def f(a): return (a+1).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
symbolic = f(a.reshape(3, vi)).reshape(3, i).numpy()
expected = f(a).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_add(self):
def f(a, b): return (a+b).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(3, i)
symbolic = f(a.reshape(3, vi), b.reshape(3, vi)).reshape(3, i).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_matmul(self):
def f(a, b): return (a@b).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(i, 5)
symbolic = f(a.reshape(3, vi), b.reshape(vi, 5)).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_attention(self, dropout_p=0.0):
def f(q, k, v): return Tensor.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
q = Tensor.rand(2, 1, 4, 8)
k = Tensor.rand(2, i, 4, 8)
v = Tensor.rand(2, i, 4, 8)
symbolic = f(q, k.reshape(2, vi, 4, 8), v.reshape(2, vi, 4, 8)).reshape(2, 4, 1, 8).numpy()
expected = f(q, k, v).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_attention_training(self):
with Tensor.train():
self.test_attention(dropout_p=0.0)
with self.assertRaises(AssertionError):
# symbolic shape dropout is not supported
self.test_attention(dropout_p=0.5)
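      # the dropout mask is random over a concrete element count, which a
      # symbolic dim can't provide - presumably why dropout_p > 0 asserts here
      # while dropout_p=0.0 (no mask applied) still passes under training.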
def test_cat_dim0(self):
def f(a, b): return a.cat(b, dim=0).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(i, 3)
b = Tensor.rand(2, 3)
symbolic = f(a.reshape(vi, 3), b).reshape(i+2, 3).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_cat_dim1(self):
def f(a, b): return a.cat(b, dim=1).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(3, 2)
symbolic = f(a.reshape(3, vi), b).reshape(3, i+2).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_cat_dim0_two_vars(self):
def f(a, b): return a.cat(b, dim=0).realize()
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(i, 3)
b = Tensor.rand(j, 3)
symbolic = f(a.reshape(vi, 3), b.reshape(vj, 3)).reshape(i+j, 3).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_cat_dim1_two_vars(self):
def f(a, b): return a.cat(b, dim=1).realize()
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(3, i)
b = Tensor.rand(3, j)
symbolic = f(a.reshape(3, vi), b.reshape(3, vj)).reshape(3, i+j).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_two_vars_plus1(self):
def f(a, b): return (a@b+1).realize()
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(i, 3)
b = Tensor.rand(3, j)
symbolic = f(a.reshape(vi, 3), b.reshape(3, vj)).reshape(i, j).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_shrink(self):
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(7, 11)
symbolic = a.shrink(((3,5),(vi,vi+2)))
symbolic = symbolic.numpy()
expected = a.shrink(((3,5),(i,i+2))).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,173 @@
import unittest
from tinygrad.shape.shapetracker import ShapeTracker, View
from tinygrad.shape.symbolic import Variable
from tinygrad.tensor import Tensor
class TestSymbolic(unittest.TestCase):
def test_symbolic_st(self):
x = Variable("x", 1, 100)
st = ShapeTracker.from_shape((x, 3))
assert st.shape == (x, 3)
assert st.real_strides() == (3, 1)
def test_expr_idxs(self):
x = Variable("x", 1, 100)
st = ShapeTracker.from_shape((x, 3))
idxs = [Variable("x", 0, 100), Variable("y", 0, 100)]
e1, e2 = st.expr_idxs(idxs)
assert e1.render() == "((x*3)+y)"
assert e2.render() == "1"
st = st.permute((1, 0))
e1, e2 = st.expr_idxs(idxs)
assert e1.render() == "((y*3)+x)"
assert e2.render() == "1"
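    # stride math behind the asserts: a contiguous (x, 3) tracker has strides
    # (3, 1), so indexing with (x, y) renders as x*3 + y; permute((1, 0)) swaps
    # the strides, giving y*3 + x over the same underlying buffer.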
def test_cat_dim0_strides(self):
i = Variable("i", 1, 5).bind(3)
j = Variable("j", 1, 5).bind(3)
k = Variable("k", 1, 5).bind(3)
t = Tensor.rand(3, 4).reshape(i, 4).cat(Tensor.rand(3, 4).reshape(j, 4), dim=0).cat(Tensor.rand(3, 4).reshape(k, 4), dim=0)
st = t.lazydata.st
assert st.shape == (i+j+k, 4)
assert st.real_strides() == (4, 1)
t = Tensor.rand(3, 3).reshape(i, 3).cat(Tensor.rand(3, 3).reshape(i, 3), dim=0).cat(Tensor.rand(3, 3), dim=0)
st = t.lazydata.st
assert st.shape == (2*i+3, 3)
assert st.real_strides() == (3, 1)
def test_cat_dim1_strides(self):
i = Variable("i", 1, 5).bind(4)
j = Variable("j", 1, 5).bind(4)
k = Variable("k", 1, 5).bind(4)
t = Tensor.rand(3, 4).reshape(3, i).cat(Tensor.rand(3, 4).reshape(3, j), dim=1).cat(Tensor.rand(3, 4).reshape(3, k), dim=1)
st = t.lazydata.st
assert st.shape == (3, i+j+k)
assert st.real_strides() == (i+j+k, 1)
class TestSymbolicVarVals(unittest.TestCase):
def test_var_vals_empty(self):
assert ShapeTracker.from_shape((3, 4, 5)).var_vals == {}
def test_var_vals_shape(self):
x = Variable("x", 1, 100).bind(3)
assert ShapeTracker.from_shape((x, 3)).var_vals == {Variable("x", 1, 100): 3}
def test_var_vals_offset(self):
x = Variable("x", 1, 100).bind(3)
st = ShapeTracker.from_shape((4, 3)).shrink(((x, x+1), (0, 3)))
assert st.real_offset() == x * 3
assert st.var_vals == {Variable("x", 1, 100): 3}
def test_var_vals_mask(self):
x = Variable("x", 1, 100).bind(3)
view = View.create(shape=(3,4), strides=(4,1), offset=0, mask=((0, x), (0, 4)))
st = ShapeTracker(views=(view,))
assert st.var_vals == {Variable("x", 1, 100): 3}
def test_var_vals_complex(self):
x = Variable("x", 1, 100).bind(3)
y = Variable("y", 1, 100).bind(4)
z = Variable("z", 1, 100).bind(5)
st = ShapeTracker.from_shape((x, 5, y)).shrink(((0, x), (z, z+1), (0, 3)))
assert st.real_offset() == y * z
assert st.var_vals == {Variable("x", 1, 100): 3, Variable("y", 1, 100):4, Variable("z", 1, 100): 5}
def test_shrink_reshape(self):
x = Variable("x", 1, 100).bind(3)
st = ShapeTracker.from_shape((10, 10, 10)).shrink(((x, x+3), (3, 7), (2, 5)))
st = st.reshape((3*4*3,))
assert st.var_vals == {Variable("x", 1, 100): 3}
class TestShapeTrackerUnbind(unittest.TestCase):
def test_view_unbind(self):
v = Variable("v", 1, 100)
bv = Variable("v", 1, 100).bind(3)
assert View.create(shape=(bv, 4)).unbind() == View.create(shape=(v, 4))
def test_reshape_unbind(self):
v = Variable("v", 1, 100)
bv = Variable("v", 1, 100).bind(3)
t = Tensor.rand(3, 4).reshape(bv, 4)
assert t.lazydata.st.unbind() == ShapeTracker((View.create(shape=(v, 4)),))
def test_shrink_unbind(self):
v = Variable("v", 1, 100)
bv = Variable("v", 1, 100).bind(2)
t = Tensor.rand(3, 4).shrink(((bv, bv+1), (0, 4)))
assert t.lazydata.st.unbind() == ShapeTracker((View.create(shape=(1, 4), offset=4*v),))
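  # unbind() drops the concrete values from bound Variables, so trackers that
  # differ only in the bound value (3 vs 2 above) unbind to equal symbolic
  # trackers - a sketch of how one symbolic kernel can serve many shapes.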
class TestSymbolicReshape(unittest.TestCase):
def test_reshape_into_symbols_simple(self):
for i in range(1, 6):
vi = Variable("i", 1, 5).bind(i)
t = Tensor.rand(i, 4).reshape(vi, 4)
assert t.shape == (vi, 4)
t = Tensor.rand(i, 6).reshape(vi, 2, 3)
assert t.shape == (vi, 2, 3)
def test_reshape_symbols_reshape_ints(self):
for i in range(1, 6):
vi = Variable("i", 1, 5).bind(i)
t = Tensor.rand(i, 4).reshape(vi, 4)
assert t.shape == (vi, 4)
t = t.reshape(i, 4)
assert t.shape == (i, 4)
def test_reshape_into_symbols_bad_shape(self):
vi = Variable("i", 1, 10).bind(4)
with self.assertRaises(AssertionError):
      t = Tensor.rand(4, 6).reshape(vi, 6).reshape(1, 77) # reshape to a new shape with a different total size through a symbolic dim
    with self.assertRaises(AssertionError):
      t = Tensor.rand(3, 4).reshape(3, (vi+1)) # reshape into an expression that is not a plain Variable
def test_two_symbol_reshape(self):
for i in range(1, 6):
for j in range(1, 6):
vi = Variable("i", 1, 5).bind(i)
vj = Variable("j", 1, 5).bind(j)
t = Tensor.rand(i, j).reshape(vi, vj)
assert t.shape == (vi, vj)
# NOTE: this is currently not allowed
# t = t.reshape(1, vi*vj)
# assert t.shape == (1, vi*vj)
t = t.reshape(vj, vi)
assert t.shape == (vj, vi)
class TestSymbolicExpand(unittest.TestCase):
def test_expand_into_symbols(self):
    # TODO: enforce expand only into bound variables
vi = Variable("i", 1, 5)
vj = Variable("j", 1, 5)
a = Tensor([[1], [2], [3]]).expand((3, vi))
assert a.shape == (3, vi)
a = a.reshape(3, vi, 1).expand((3, vi, vj))
assert a.shape == (3, vi, vj)
def test_plus_expands_constant(self):
for i in range(1, 6):
vi = Variable("i", 1, 5).bind(i)
a = Tensor.rand(3, i).reshape(3, vi)
a = a + 1
assert a.shape == (3, vi)
class TestSymbolicShrink(unittest.TestCase):
def test_shrink_symbols(self):
vi = Variable("i", 1, 5)
t = Tensor.rand(3, 5).shrink(((0, 2), (vi, vi+1)))
assert t.shape == (2, 1)
class TestSymbolicShapeExpr(unittest.TestCase):
def test_symbolic_expr_idxs(self):
# taken from symbolic shape llama
i = Variable("i", 1, 120)
gidx0 = Variable("gidx0", 0, i)
lidx1 = Variable("lidx1", 0, 7)
idx = (gidx0, lidx1, Variable.num(1))
shape = (i+1, 8, 4)
strides = (1, (i*4)+4, i+1)
st = ShapeTracker((View.create(shape, strides), ))
idx, valid = st.expr_idxs(idx)
assert idx.render() == "((lidx1*((i*4)+4))+1+gidx0+i)"
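    # sanity check of the render: the strides dotted with the idxs give
    # gidx0*1 + lidx1*((i*4)+4) + 1*(i+1), which prints as the asserted string.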
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,266 @@
import numpy as np
import torch
import struct
import unittest, copy
import mmap
from tinygrad.tensor import Tensor, Device
from tinygrad.helpers import dtypes
from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from extra.utils import temp
x_init = np.random.randn(1,3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)
class TestTinygrad(unittest.TestCase):
def test_zerodim_initialization(self):
a = Tensor(55)
b = Tensor(3.14)
self.assertEqual(a.shape, ())
self.assertEqual(b.shape, ())
def test_plus_equals(self):
a = Tensor.randn(10,10)
b = Tensor.randn(10,10)
c = a + b
val1 = c.numpy()
a += b
val2 = a.numpy()
np.testing.assert_allclose(val1, val2)
def test_backward_pass(self):
def test_tinygrad():
x = Tensor(x_init, requires_grad=True)
W = Tensor(W_init, requires_grad=True)
m = Tensor(m_init)
out = x.dot(W).relu()
out = out.log_softmax()
out = out.mul(m).add(m).sum()
out.backward()
return out.numpy(), x.grad.numpy(), W.grad.numpy()
def test_pytorch():
x = torch.tensor(x_init, requires_grad=True)
W = torch.tensor(W_init, requires_grad=True)
m = torch.tensor(m_init)
out = x.matmul(W).relu()
out = torch.nn.functional.log_softmax(out, dim=1)
out = out.mul(m).add(m).sum()
out.backward()
return out.detach().numpy(), x.grad, W.grad
for x,y in zip(test_tinygrad(), test_pytorch()):
np.testing.assert_allclose(x, y, atol=1e-5)
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "this test uses more than 8 bufs which breaks webgpu") #TODO: remove after #1461
def test_backward_pass_diamond_model(self):
def test_tinygrad():
u = Tensor(U_init, requires_grad=True)
v = Tensor(V_init, requires_grad=True)
w = Tensor(W_init, requires_grad=True)
x = u.mul(v).relu()
y = u.mul(w).relu()
out = x.add(y).mul(y).relu()
out = out.log_softmax()
out = out.sum()
out.backward()
return out.numpy(), u.grad.numpy(), v.grad.numpy(), w.grad.numpy()
def test_pytorch():
u = torch.tensor(U_init, requires_grad=True)
v = torch.tensor(V_init, requires_grad=True)
w = torch.tensor(W_init, requires_grad=True)
x = u.mul(v).relu()
y = u.mul(w).relu()
out = x.add(y).mul(y).relu()
out = torch.nn.functional.log_softmax(out, dim=1)
out = out.sum()
out.backward()
return out.detach().numpy(), u.grad, v.grad, w.grad
for x,y in zip(test_tinygrad(), test_pytorch()):
np.testing.assert_allclose(x, y, atol=1e-5)
def test_nograd(self):
x = Tensor(x_init, requires_grad=False)
m = Tensor(m_init, requires_grad=False)
W = Tensor(W_init, requires_grad=True)
tmp = x.mul(m)
mm = tmp.matmul(W)
out = mm.relu()
out = out.sum()
out.backward()
assert x.grad is None
assert m.grad is None
assert tmp.grad is None
assert mm.grad is not None
assert W.grad is not None
def test_dropout(self):
with Tensor.train():
n, rate = 1_000_000, 0.1
w = Tensor.ones(n).dropout(rate)
non_zeros = np.count_nonzero(w.numpy())
expected = n * (1 - rate)
np.testing.assert_allclose(non_zeros, expected, rtol=2e-3)
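      # rough arithmetic behind the tolerance: survivors ~ Binomial(n, 0.9),
      # mean 900_000 with sigma sqrt(n*0.1*0.9) ~ 300, so rtol=2e-3 (~1800)
      # leaves roughly six sigma of headroom.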
def test_jacobian(self):
W = np.random.RandomState(42069).random((10, 5)).astype(np.float32)
x = np.random.RandomState(69420).random((1, 10)).astype(np.float32)
torch_x = torch.tensor(x, requires_grad=True)
torch_W = torch.tensor(W, requires_grad=True)
torch_func = lambda x: torch.nn.functional.log_softmax(x.matmul(torch_W).relu(), dim=1)
PJ = torch.autograd.functional.jacobian(torch_func, torch_x).squeeze().numpy()
tiny_x = Tensor(x, requires_grad=True)
tiny_W = Tensor(W, requires_grad=True)
tiny_func = lambda x: x.dot(tiny_W).relu().log_softmax()
J = jacobian(tiny_func, tiny_x)
NJ = numerical_jacobian(tiny_func, tiny_x)
np.testing.assert_allclose(PJ, J, atol = 1e-5)
np.testing.assert_allclose(PJ, NJ, atol = 1e-3)
def test_gradcheck(self):
W = np.random.RandomState(1337).random((10, 5)).astype(np.float32)
x = np.random.RandomState(7331).random((1, 10)).astype(np.float32)
tiny_x = Tensor(x, requires_grad=True)
tiny_W = Tensor(W, requires_grad=True)
tiny_func = lambda x: x.dot(tiny_W).relu().log_softmax()
self.assertTrue(gradcheck(tiny_func, tiny_x, eps = 1e-3))
    # the numerical jacobian is only a coarse approximation: the "big" eps above passes, while the much smaller one falls below float32 finite-difference precision and is expected to fail
self.assertFalse(gradcheck(tiny_func, tiny_x, eps = 1e-5))
def test_random_fns_are_deterministic_with_seed(self):
for random_fn in [Tensor.randn, Tensor.normal, Tensor.uniform, Tensor.scaled_uniform, Tensor.glorot_uniform, Tensor.kaiming_normal]:
with self.subTest(msg=f"Tensor.{random_fn.__name__}"):
Tensor.manual_seed(1337)
a = random_fn(10,10).realize()
Tensor.manual_seed(1337)
b = random_fn(10,10).realize()
np.testing.assert_allclose(a.numpy(), b.numpy())
def test_randn_isnt_inf_on_zero(self):
# simulate failure case of rand handing a zero to randn
original_rand, Tensor.rand = Tensor.rand, Tensor.zeros
    try: self.assertNotIn(np.inf, Tensor.randn(16).numpy())
    finally: Tensor.rand = original_rand
def test_zeros_like_has_same_dtype(self):
for datatype in [dtypes.float16, dtypes.float32, dtypes.int8, dtypes.int32, dtypes.int64, dtypes.uint8]:
a = Tensor([1, 2, 3], dtype=datatype)
b = Tensor.zeros_like(a)
assert a.dtype == b.dtype, f"a.dtype and b.dtype should be {datatype}"
      assert a.shape == b.shape, f"shape mismatch: input {a.shape} != Tensor.zeros_like output {b.shape}"
a = Tensor([1, 2, 3])
b = Tensor.zeros_like(a, dtype=dtypes.int8)
assert a.dtype != b.dtype and a.dtype == dtypes.float32 and b.dtype == dtypes.int8, "a.dtype should be float and b.dtype should be char"
    assert a.shape == b.shape, f"shape mismatch: input {a.shape} != Tensor.zeros_like output {b.shape}"
def test_ones_like_has_same_dtype_and_shape(self):
for datatype in [dtypes.float16, dtypes.float32, dtypes.int8, dtypes.int32, dtypes.int64, dtypes.uint8]:
a = Tensor([1, 2, 3], dtype=datatype)
b = Tensor.ones_like(a)
assert a.dtype == b.dtype, f"a.dtype and b.dtype should be {datatype}"
      assert a.shape == b.shape, f"shape mismatch: input {a.shape} != Tensor.ones_like output {b.shape}"
a = Tensor([1, 2, 3])
b = Tensor.ones_like(a, dtype=dtypes.int8)
assert a.dtype != b.dtype and a.dtype == dtypes.float32 and b.dtype == dtypes.int8, "a.dtype should be float and b.dtype should be char"
    assert a.shape == b.shape, f"shape mismatch: input {a.shape} != Tensor.ones_like output {b.shape}"
def test_ndim(self):
assert Tensor.randn(1).ndim == 1
assert Tensor.randn(2,2,2).ndim == 3
assert Tensor.randn(1,1,1,1,1,1).ndim == 6
def test_argfix(self):
self.assertEqual(Tensor.zeros().shape, ())
self.assertEqual(Tensor.ones().shape, ())
self.assertEqual(Tensor.zeros([]).shape, ())
self.assertEqual(Tensor.ones([]).shape, ())
self.assertEqual(Tensor.zeros(tuple()).shape, ())
self.assertEqual(Tensor.ones(tuple()).shape, ())
self.assertEqual(Tensor.zeros(1).shape, (1,))
self.assertEqual(Tensor.ones(1).shape, (1,))
self.assertEqual(Tensor.zeros(1,10,20).shape, (1,10,20))
self.assertEqual(Tensor.ones(1,10,20).shape, (1,10,20))
self.assertEqual(Tensor.zeros([1]).shape, (1,))
self.assertEqual(Tensor.ones([1]).shape, (1,))
self.assertEqual(Tensor.zeros([10,20,40]).shape, (10,20,40))
self.assertEqual(Tensor.ones([10,20,40]).shape, (10,20,40))
def test_numel(self):
assert Tensor.randn(10, 10).numel() == 100
assert Tensor.randn(1,2,5).numel() == 10
assert Tensor.randn(1,1,1,1,1,1).numel() == 1
assert Tensor([]).numel() == 0
    # assert Tensor.randn(1,0,2,5).numel() == 0 # TODO: fix empty tensors
def test_element_size(self):
for _, dtype in dtypes.fields().items():
assert dtype.itemsize == Tensor.randn(3, dtype=dtype).element_size(), f"Tensor.element_size() not matching Tensor.dtype.itemsize for {dtype}"
def test_deepwalk_ctx_check(self):
layer = Tensor.uniform(1, 1, requires_grad=True)
x = Tensor.randn(1, 1, 1)
x.dot(layer).mean().backward()
x = Tensor.randn(1, 1, 1)
x.dot(layer).mean().backward()
def test_zerosized_tensors(self):
Tensor([]).realize()
Tensor([]).numpy()
def test_tensor_ndarray_dtype(self):
arr = np.array([1]) # where dtype is implicitly int64
assert Tensor(arr).dtype == dtypes.int64
assert Tensor(arr, dtype=dtypes.float32).dtype == dtypes.float32 # check if ndarray correctly casts to Tensor dtype
assert Tensor(arr, dtype=dtypes.float64).dtype == dtypes.float64 # check that it works for something else
def test_tensor_list_dtype(self):
arr = [1]
assert Tensor(arr).dtype == Tensor.default_type
assert Tensor(arr, dtype=dtypes.float32).dtype == dtypes.float32
assert Tensor(arr, dtype=dtypes.float64).dtype == dtypes.float64
def test_tensor_copy(self):
x = copy.deepcopy(Tensor.ones((3,3,3)))
np.testing.assert_allclose(x.numpy(), np.ones((3,3,3)))
def test_copy_from_disk(self):
t = Tensor.randn(30, device="CPU").to(f"disk:{temp('test_copy_from_disk')}")
a = t[10:20]
dev = a.to(Device.DEFAULT)
np.testing.assert_allclose(a.numpy(), dev.numpy())
# Regression test for https://github.com/tinygrad/tinygrad/issues/1751
def test_copy_from_numpy_unaligned(self):
# 2**15 is the minimum for repro
arr = np.random.randn(2**15).astype(dtypes.float.np)
fn = temp('test_copy_from_numpy_unaligned')
with open(fn, 'wb') as f: f.write(b't' + arr.tobytes())
with open(fn, "a+b") as f: memview = memoryview(mmap.mmap(f.fileno(), arr.nbytes + 1))
ua_arr = np.frombuffer(memview[1:], dtype=arr.dtype, count=arr.shape[0])
np.testing.assert_allclose(arr, ua_arr)
assert not ua_arr.flags.aligned
    # force a device copy: .to() would be optimized away, and dividing by a python scalar 1 would be folded, hence the explicit Tensor(1)
np.testing.assert_allclose(ua_arr, (Tensor(ua_arr)/Tensor(1)).numpy())
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,99 @@
from typing import Optional, Tuple, Any, List
import unittest, math
import numpy as np
from tinygrad.helpers import dtypes, getenv, DType, PtrDType
from tinygrad.tensor import Device
from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, ASTRunner, Compiled
from tinygrad.codegen.linearizer import UOps, UOp
def _uops_to_prg(uops):
src, runtime_args = Device[Device.DEFAULT].renderer("test", uops)
return ASTRunner("test", src,
[1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
runtime_args=runtime_args).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
uops.append(UOp(uop, dtype, tuple(vin), arg, len(uops)))
return uops[-1]
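# a sketch of the one-element programs the helpers below assemble, in order:
#   DEFINE_GLOBAL data0 (output) and data1..dataN (inputs)
#   LOAD each input at CONST index 0 (or CONST immediates in the _const variant)
#   ALU applies `op` to the loaded values
#   STORE writes the result to data0[0]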
def _test_single_value(vals, op, dtype):
uops = []
buf_store = uop(uops, UOps.DEFINE_GLOBAL, PtrDType(dtype), (), ('data0', dtype))
buf_loads = [uop(uops, UOps.DEFINE_GLOBAL, PtrDType(dtype), (), (f'data{i+1}', dtype)) for i in range(len(vals))]
loads = (uop(uops, UOps.LOAD, dtype, [buf_loads[i], uop(uops, UOps.CONST, dtypes.int32, (), 0)]) for i in range(len(vals)))
alu = uop(uops, UOps.ALU, dtype, loads, op)
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Device[Device.DEFAULT].buffer(1, dtype)
buf2 = [Device[Device.DEFAULT].buffer.fromCPU(np.array([a], dtype=dtype.np)) for a in vals]
prg = _uops_to_prg(uops)
prg([buf]+buf2)
return buf.toCPU()[0]
def _test_single_value_const(vals, op, dtype):
uops = []
buf_store = uop(uops, UOps.DEFINE_GLOBAL, PtrDType(dtype), (), ('data0', dtype))
loads = (uop(uops, UOps.CONST, dtype, [], a) for a in vals)
alu = uop(uops, UOps.ALU, dtype, loads, op)
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Device[Device.DEFAULT].buffer(1, dtype)
prg = _uops_to_prg(uops)
prg([buf])
return buf.toCPU()[0]
class TestUOps(unittest.TestCase):
def _equal(self, v1, v2):
if not (math.isnan(v1) and math.isnan(v2)): self.assertAlmostEqual(v1, v2, places=5)
def _test_uop_fxn(self, bop, fxn, dt=dtypes.float32):
for f in [_test_single_value, _test_single_value_const]:
for a in [-2.0, 0.0, 1.0]:
self._equal(f([a], bop, dt), fxn(a))
def _test_bop_fxn(self, bop, fxn, dt=dtypes.float32, no_b_zero=False):
for f in [_test_single_value, _test_single_value_const]:
for a in [-2.0, 0.0, 1.0]:
for b in [-3.0, 1.0] + ([] if no_b_zero else [0.0]):
self._equal(f([a,b], bop, dt), fxn(a,b))
def _test_top_fxn(self, bop, fxn, dt=dtypes.float32):
for f in [_test_single_value, _test_single_value_const]:
for a in [-2.0, 0, 1]:
for b in [-3.0, 3.0]:
for c in [-4.0, 4.0]:
self._equal(f([a,b,c], bop, dt), fxn(a,b,c))
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
class TestFloatUOps(TestUOps):
def test_neg(self): self._test_uop_fxn(UnaryOps.NEG, lambda a: -a)
def test_exp2(self): self._test_uop_fxn(UnaryOps.EXP2, lambda a: np.exp2(a))
def test_log2(self): self._test_uop_fxn(UnaryOps.LOG2, lambda a: math.log2(a) if a > 0 else float('-inf' if a==0 else 'nan'))
def test_sin(self): self._test_uop_fxn(UnaryOps.SIN, lambda a: math.sin(a))
def test_sqrt(self): self._test_uop_fxn(UnaryOps.SQRT, lambda a: math.sqrt(a) if a >= 0 else float('nan'))
# this is not on most backends
#def test_recip(self): self._test_uop_fxn(UnaryOps.RECIP, lambda a: 1.0/a if a != 0 else float('inf'))
def test_add(self): self._test_bop_fxn(BinaryOps.ADD, lambda a,b: a+b)
def test_sub(self): self._test_bop_fxn(BinaryOps.SUB, lambda a,b: a-b)
def test_mul(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: a*b)
def test_div(self): self._test_bop_fxn(BinaryOps.DIV, lambda a,b: a/b if b != 0 else a*float('inf'))
def test_max(self): self._test_bop_fxn(BinaryOps.MAX, lambda a,b: max(a,b))
def test_cmplt(self): self._test_bop_fxn(BinaryOps.CMPLT, lambda a,b: float(a<b))
# MOD isn't tested on floats
def test_mulacc(self): self._test_top_fxn(TernaryOps.MULACC, lambda a,b,c: (a*b)+c)
def test_where(self): self._test_top_fxn(TernaryOps.WHERE, lambda a,b,c: b if a!=0 else c)
# TODO: fix this on all the backends
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled) or getenv('ARM64', False), "only test for compiled backends, broken on some")
class TestNonFloatUOps(TestUOps):
def test_neg_int32(self): self._test_uop_fxn(UnaryOps.NEG, lambda a: -a, dtypes.int32)
def test_add_int32(self): self._test_bop_fxn(BinaryOps.ADD, lambda a,b: int(a)+int(b), dtypes.int32)
def test_sub_int32(self): self._test_bop_fxn(BinaryOps.SUB, lambda a,b: int(a)-int(b), dtypes.int32)
def test_mul_int32(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: int(a)*int(b), dtypes.int32)
def test_div_int32(self): self._test_bop_fxn(BinaryOps.DIV, lambda a,b: int(a/b), dtypes.int32, no_b_zero=True)
def test_mod_int32(self): self._test_bop_fxn(BinaryOps.MOD, lambda a,b: abs(int(a))%abs(int(b))*(1,-1)[a<0], dtypes.int32, no_b_zero=True)
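  # the MOD reference above is truncated (C-style) mod: the result takes the
  # sign of the dividend, e.g. a=-7, b=3 gives -1 here versus Python's floored
  # -7 % 3 == 2.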
def test_cmplt_int32(self): self._test_bop_fxn(BinaryOps.CMPLT, lambda a,b: float(a<b), dtypes.int32)
def test_mul_bool(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: bool(a) and bool(b), dtypes.bool)
if __name__ == '__main__':
unittest.main(verbosity=2)

View File

@@ -0,0 +1,51 @@
const puppeteer = require('puppeteer');
const { spawn } = require('child_process');
const res = spawn("python", ["-m", "http.server", "8000"], { shell: true });
async function timeout(time) {
return new Promise((resolve) => setTimeout(resolve, time));
}
function cleanup(err) {
res.kill();
if(err != null) {
console.error(err);
process.exit(1);
}
}
async function waitForText(selector, text) {
let n = 0;
let ready = false;
while (n < 10) {
const res = await (await selector.getProperty("textContent")).jsonValue();
console.log(`waiting for text ${text} got ${res}`);
if(res == text) {
ready = true;
break
}
await timeout(2000);
n += 1
}
return ready;
}
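// polls the selector's textContent every 2 seconds, up to 10 tries (~20s), so
// slow WebGPU model compilation doesn't flake the test on the first check.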
puppeteer.launch({ headless: false, args: ["--enable-unsafe-webgpu"]}).then(async browser => {
const page = await browser.newPage();
page.on("console", message => console.log(`message from console ${message.text()}`))
.on("pageerror", ({ message }) => console.log(`error from page ${message}`))
const res = await page.goto("http://localhost:8000/examples/index.html");
if(res.status() != 200) throw new Error("Failed to load page");
const textSelector = await page.waitForSelector("#result");
const buttonSelector = await page.waitForSelector("input[type=button]");
const ready = await waitForText(textSelector, "ready");
if(!ready) throw new Error("Failed to load page");
await buttonSelector.evaluate(e => e.click());
const done = await waitForText(textSelector, "hen");
if(!done) throw new Error("failed to get hen");
browser.close();
cleanup(null);
}).catch(err => {
cleanup(err);
});

View File

@@ -0,0 +1,40 @@
import unittest
from tinygrad.helpers import Timing, CI
from tinygrad.tensor import Tensor
from tinygrad.ops import LoadOps
from tinygrad.codegen.linearizer import Linearizer
from test.test_net_speed import start_profile, stop_profile
class TestWinograd(unittest.TestCase):
def setUp(self):
self.old = Tensor.wino
Tensor.wino = 1
def tearDown(self): Tensor.wino = self.old
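  # Tensor.wino toggles the Winograd conv path, which trades fewer multiplies
  # for a much larger expression graph - hence these tests time scheduling and
  # linearization rather than checking numerics.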
def test_speed(self):
x = Tensor.empty(1,4,9,9)
w = Tensor.empty(4,4,3,3)
with Timing("running conv: "):
out = Tensor.conv2d(x, w)
with Timing("scheduling: "):
sched = out.lazydata.schedule()
for i,s in enumerate(sched):
if s.ast.op in LoadOps: continue
ops = s.ast.get_lazyops()
with Timing(f"linearize {i} with {len(ops):4d} ops: "):
l = Linearizer(s.ast)
l.hand_coded_optimizations()
l.linearize()
def test_profile(self):
x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
if not CI: pr = start_profile()
out = Tensor.conv2d(x,w).realize()
if not CI: stop_profile(pr, sort='time')
out.numpy()
if __name__ == '__main__':
unittest.main(verbosity=2)

View File

@@ -0,0 +1,66 @@
import unittest
import pickle
from tinygrad.helpers import diskcache_get, diskcache_put
def remote_get(table,q,k): q.put(diskcache_get(table, k))
def remote_put(table,k,v): diskcache_put(table, k, v)
class DiskCache(unittest.TestCase):
def test_putget(self):
table = "test_putget"
diskcache_put(table, "hello", "world")
self.assertEqual(diskcache_get(table, "hello"), "world")
diskcache_put(table, "hello", "world2")
self.assertEqual(diskcache_get(table, "hello"), "world2")
def test_putcomplex(self):
table = "test_putcomplex"
diskcache_put(table, "k", ("complex", 123, "object"))
ret = diskcache_get(table, "k")
self.assertEqual(ret, ("complex", 123, "object"))
def test_getotherprocess(self):
table = "test_getotherprocess"
from multiprocessing import Process, Queue
diskcache_put(table, "k", "getme")
q = Queue()
p = Process(target=remote_get, args=(table,q,"k"))
p.start()
p.join()
self.assertEqual(q.get(), "getme")
def test_putotherprocess(self):
table = "test_putotherprocess"
from multiprocessing import Process
p = Process(target=remote_put, args=(table,"k", "remote"))
p.start()
p.join()
self.assertEqual(diskcache_get(table, "k"), "remote")
def test_no_table(self):
self.assertIsNone(diskcache_get("faketable", "k"))
def test_ret(self):
table = "test_ret"
self.assertEqual(diskcache_put(table, "key", ("vvs",)), ("vvs",))
def test_non_str_key(self):
table = "test_non_str_key"
diskcache_put(table, 4, 5)
self.assertEqual(diskcache_get(table, 4), 5)
self.assertEqual(diskcache_get(table, "4"), 5)
def test_dict_key(self):
table = "test_dict_key"
fancy_key = {"hello": "world", "goodbye": 7, "good": True, "pkl": pickle.dumps("cat")}
fancy_key2 = {"hello": "world", "goodbye": 8, "good": True, "pkl": pickle.dumps("cat")}
fancy_key3 = {"hello": "world", "goodbye": 8, "good": True, "pkl": pickle.dumps("dog")}
diskcache_put(table, fancy_key, 5)
self.assertEqual(diskcache_get(table, fancy_key), 5)
diskcache_put(table, fancy_key2, 8)
self.assertEqual(diskcache_get(table, fancy_key2), 8)
self.assertEqual(diskcache_get(table, fancy_key), 5)
self.assertEqual(diskcache_get(table, fancy_key3), None)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,150 @@
import pathlib
import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
from tinygrad.helpers import dtypes
from tinygrad.runtime.ops_disk import RawDiskBuffer
from tinygrad.helpers import Timing
from extra.utils import fetch_as_file, temp
def compare_weights_both(url):
import torch
fn = fetch_as_file(url)
tg_weights = get_state_dict(torch_load(fn))
torch_weights = get_state_dict(torch.load(fn), tensor_type=torch.Tensor)
assert list(tg_weights.keys()) == list(torch_weights.keys())
for k in tg_weights:
np.testing.assert_equal(tg_weights[k].numpy(), torch_weights[k].numpy(), err_msg=f"mismatch at {k}, {tg_weights[k].shape}")
print(f"compared {len(tg_weights)} weights")
class TestTorchLoad(unittest.TestCase):
# pytorch pkl format
def test_load_enet(self): compare_weights_both("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth")
# pytorch zip format
def test_load_enet_alt(self): compare_weights_both("https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth")
# pytorch zip format
def test_load_convnext(self): compare_weights_both('https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth')
# TODO: support pytorch tar format with minimal lines
#def test_load_resnet(self): compare_weights_both('https://download.pytorch.org/models/resnet50-19c8e357.pth')
test_fn = pathlib.Path(__file__).parents[2] / "weights/LLaMA/7B/consolidated.00.pth"
#test_size = test_fn.stat().st_size
test_size = 1024*1024*1024*2
# sudo su -c 'sync; echo 1 > /proc/sys/vm/drop_caches' && python3 test/unit/test_disk_tensor.py TestRawDiskBuffer.test_readinto_read_speed
@unittest.skipIf(not test_fn.exists(), "download LLaMA weights for read in speed tests")
class TestRawDiskBuffer(unittest.TestCase):
def test_readinto_read_speed(self):
tst = np.empty(test_size, np.uint8)
with open(test_fn, "rb") as f:
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
f.readinto(tst)
def test_mmap_read_speed(self):
db = RawDiskBuffer(test_size, dtype=dtypes.uint8, device=test_fn)
tst = np.empty(test_size, np.uint8)
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
np.copyto(tst, db.toCPU())
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu doesn't support uint8 datatype")
class TestSafetensors(unittest.TestCase):
def test_real_safetensors(self):
import torch
from safetensors.torch import save_file
torch.manual_seed(1337)
tensors = {
"weight1": torch.randn((16, 16)),
"weight2": torch.arange(0, 17, dtype=torch.uint8),
"weight3": torch.arange(0, 17, dtype=torch.int32).reshape(17,1,1),
"weight4": torch.arange(0, 2, dtype=torch.uint8),
}
save_file(tensors, temp("model.safetensors"))
ret = safe_load(temp("model.safetensors"))
for k,v in tensors.items(): np.testing.assert_array_equal(ret[k].numpy(), v.numpy())
safe_save(ret, temp("model.safetensors_alt"))
with open(temp("model.safetensors"), "rb") as f:
with open(temp("model.safetensors_alt"), "rb") as g:
assert f.read() == g.read()
ret2 = safe_load(temp("model.safetensors_alt"))
for k,v in tensors.items(): np.testing.assert_array_equal(ret2[k].numpy(), v.numpy())
def test_efficientnet_safetensors(self):
from models.efficientnet import EfficientNet
model = EfficientNet(0)
state_dict = get_state_dict(model)
safe_save(state_dict, temp("eff0"))
state_dict_loaded = safe_load(temp("eff0"))
assert sorted(list(state_dict_loaded.keys())) == sorted(list(state_dict.keys()))
for k,v in state_dict.items():
np.testing.assert_array_equal(v.numpy(), state_dict_loaded[k].numpy())
# load with the real safetensors
from safetensors import safe_open
with safe_open(temp("eff0"), framework="pt", device="cpu") as f:
assert sorted(list(f.keys())) == sorted(list(state_dict.keys()))
for k in f.keys():
np.testing.assert_array_equal(f.get_tensor(k).numpy(), state_dict[k].numpy())
def test_huggingface_enet_safetensors(self):
# test a real file
fn = fetch_as_file("https://huggingface.co/timm/mobilenetv3_small_075.lamb_in1k/resolve/main/model.safetensors")
state_dict = safe_load(fn)
assert len(state_dict.keys()) == 244
assert 'blocks.2.2.se.conv_reduce.weight' in state_dict
assert state_dict['blocks.0.0.bn1.num_batches_tracked'].numpy() == 276570
assert state_dict['blocks.2.0.bn2.num_batches_tracked'].numpy() == 276570
def test_metadata(self):
metadata = {"hello": "world"}
safe_save({}, temp('metadata.safetensors'), metadata)
import struct
with open(temp('metadata.safetensors'), 'rb') as f:
dat = f.read()
sz = struct.unpack(">Q", dat[0:8])[0]
import json
assert json.loads(dat[8:8+sz])['__metadata__']['hello'] == 'world'
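    # safetensors layout: an 8-byte little-endian header length, the JSON
    # header (user metadata lives under "__metadata__"), then raw tensor
    # bytes - none here since the saved dict is empty.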
def helper_test_disk_tensor(fn, data, np_fxn, tinygrad_fxn=None):
if tinygrad_fxn is None: tinygrad_fxn = np_fxn
pathlib.Path(temp(fn)).unlink(missing_ok=True)
tinygrad_tensor = Tensor(data, device="CPU").to(f"disk:{temp(fn)}")
numpy_arr = np.array(data)
tinygrad_fxn(tinygrad_tensor)
np_fxn(numpy_arr)
np.testing.assert_allclose(tinygrad_tensor.numpy(), numpy_arr)
class TestDiskTensor(unittest.TestCase):
def test_empty(self):
pathlib.Path(temp("dt1")).unlink(missing_ok=True)
Tensor.empty(100, 100, device=f"disk:{temp('dt1')}")
def test_write_ones(self):
pathlib.Path(temp("dt2")).unlink(missing_ok=True)
out = Tensor.ones(10, 10, device="CPU")
outdisk = out.to(f"disk:{temp('dt2')}")
print(outdisk)
outdisk.realize()
del out, outdisk
# test file
with open(temp("dt2"), "rb") as f:
assert f.read() == b"\x00\x00\x80\x3F" * 100
# test load alt
reloaded = Tensor.empty(10, 10, device=f"disk:{temp('dt2')}")
out = reloaded.numpy()
assert np.all(out == 1.)
def test_assign_slice(self):
def assign(x,s,y): x[s] = y
helper_test_disk_tensor("dt3", [0,1,2,3], lambda x: assign(x, slice(0,2), [13, 12]))
helper_test_disk_tensor("dt4", [[0,1,2,3],[4,5,6,7]], lambda x: assign(x, slice(0,1), [[13, 12, 11, 10]]))
def test_reshape(self):
helper_test_disk_tensor("dt5", [1,2,3,4,5], lambda x: x.reshape((1,5)))
helper_test_disk_tensor("dt6", [1,2,3,4], lambda x: x.reshape((2,2)))
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python
import unittest
from tinygrad.ops import LazyOp, BinaryOps, ReduceOps, get_lazyop_info, BufferOps, MemBuffer
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.helpers import dtypes
class TestFlopCounter(unittest.TestCase):
def setUp(self):
self.buf0 = LazyOp(BufferOps.MEM, (), MemBuffer(1, dtypes.float32, ShapeTracker.from_shape((4,))))
self.buf1 = LazyOp(BufferOps.MEM, (), MemBuffer(2, dtypes.float32, ShapeTracker.from_shape((4,))))
def test_flops_add(self):
op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
info = get_lazyop_info(op0)
self.assertEqual(info.flops, 4)
def test_flops_add_twice(self):
op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
op1 = LazyOp(BinaryOps.ADD, (op0,self.buf1,), None)
info = get_lazyop_info(op1)
self.assertEqual(info.flops, 8)
def test_flops_add_self(self):
op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
op1 = LazyOp(BinaryOps.ADD, (op0,op0,), None)
info = get_lazyop_info(op1)
self.assertEqual(info.flops, 8)
def test_flops_add_roundabout_self(self):
op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
op1 = LazyOp(BinaryOps.ADD, (op0,self.buf1,), None)
op2 = LazyOp(BinaryOps.ADD, (op0,op1,), None)
info = get_lazyop_info(op2)
self.assertEqual(info.flops, 12)
def test_flops_red(self):
op0 = LazyOp(BinaryOps.MUL, (self.buf0,self.buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1,))
op2 = LazyOp(BinaryOps.ADD, (op1, op1,), None)
info = get_lazyop_info(op2)
self.assertEqual(info.flops, 9)
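    # arithmetic: 4 (elementwise MUL) + 4 (SUM reduction over 4 elements)
    # + 1 (ADD on the reduced scalar) = 9; the shared op1 is counted once.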
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,142 @@
import unittest
import numpy as np
from tinygrad.helpers import Context, ContextVar, DType, dtypes, merge_dicts, strip_parens, prod
from tinygrad.shape.symbolic import Variable, NumNode
VARIABLE = ContextVar("VARIABLE", 0)
class TestContextVars(unittest.TestCase):
# Ensuring that the test does not modify variables outside the tests.
ctx = Context()
def setUp(self): TestContextVars.ctx.__enter__()
def tearDown(self): TestContextVars.ctx.__exit__()
def test_initial_value_is_set(self):
_TMP = ContextVar("_TMP", 5)
self.assertEqual(_TMP.value, 5)
def test_multiple_creation_ignored(self):
_TMP2 = ContextVar("_TMP2", 1)
_TMP2 = ContextVar("_TMP2", 2)
self.assertEqual(_TMP2.value, 1)
def test_new_var_inside_context(self):
# Creating a _new_ variable inside a context should not have any effect on its scope (?)
with Context(VARIABLE=1):
_TMP3 = ContextVar("_TMP3", 1)
_TMP3 = ContextVar("_TMP3", 2)
self.assertEqual(_TMP3.value, 1)
  def test_value_across_modules(self):
# Mocking module import by invoking the code but not in our globals().
exec('from tinygrad.helpers import ContextVar;C = ContextVar("C", 13)', {}) # pylint:disable=exec-used
# It should not matter that the first creation was in another module.
C = ContextVar("C", 0)
self.assertEqual(C.value, 13)
def test_assignment_across_modules(self):
B = ContextVar("B", 1)
# local assignment
B.value = 2
self.assertEqual(B.value, 2)
# Assignment in another module.
exec('from tinygrad.helpers import ContextVar;B = ContextVar("B", 0);B.value = 3;', {}) # pylint:disable=exec-used
# Assignment in another module should affect this one as well.
self.assertEqual(B.value, 3)
def test_context_assignment(self):
with Context(VARIABLE=1):
self.assertEqual(VARIABLE.value, 1)
self.assertEqual(VARIABLE.value, 0)
def test_unknown_param_to_context(self):
with self.assertRaises(KeyError):
with Context(SOMETHING_ELSE=1):
pass
def test_inside_context_assignment(self):
with Context(VARIABLE=4):
# What you can and cannot do inside a context.
      # 1. Re-creating an existing ContextVar has no effect.
      VARIABLE = ContextVar("VARIABLE", 0)
      self.assertTrue(VARIABLE >= 4, "re-creating a ContextVar inside a contextmanager does not set a new value")
      # 2. Assigning to .value, however, does take effect locally.
      VARIABLE.value = 13
      self.assertTrue(VARIABLE.value == 13, "assigning to .value works inside a contextmanager")
# Related to 2. above. Note that VARIABLE is back to 0 again as expected.
self.assertEqual(VARIABLE.value, 0)
def test_new_var_inside_context_other_module(self):
with Context(VARIABLE=1):
_NEW2 = ContextVar("_NEW2", 0)
_NEW2 = ContextVar("_NEW2", 1)
self.assertEqual(_NEW2.value, 0)
code = """\
from tinygrad.helpers import Context, ContextVar
with Context(VARIABLE=1):
_NEW3 = ContextVar("_NEW3", 0)"""
exec(code, {}) # pylint:disable=exec-used
# While _NEW3 was created in an outside scope it should still work the same as above.
_NEW3 = ContextVar("_NEW3", 1)
self.assertEqual(_NEW3.value, 0)
def test_nested_context(self):
with Context(VARIABLE=1):
with Context(VARIABLE=2):
with Context(VARIABLE=3):
self.assertEqual(VARIABLE.value, 3)
self.assertEqual(VARIABLE.value, 2)
self.assertEqual(VARIABLE.value, 1)
self.assertEqual(VARIABLE.value, 0)
def test_decorator(self):
@Context(VARIABLE=1, DEBUG=4)
def test():
self.assertEqual(VARIABLE.value, 1)
self.assertEqual(VARIABLE.value, 0)
test()
self.assertEqual(VARIABLE.value, 0)
def test_context_exit_reverts_updated_values(self):
D = ContextVar("D", 1)
D.value = 2
with Context(D=3):
...
assert D.value == 2, f"Expected D to be 2, but was {D.value}. Indicates that Context.__exit__ did not restore to the correct value."
class TestMergeDicts(unittest.TestCase):
def test_merge_dicts(self):
a = {"a": 1, "b": 2}
b = {"a": 1, "c": 3}
c = {}
d = {"a": 2, "b": 2}
assert merge_dicts([a, b]) == {"a": 1, "b": 2, "c": 3}
assert merge_dicts([a, c]) == a
assert merge_dicts([a, b, c]) == {"a": 1, "b": 2, "c": 3}
with self.assertRaises(AssertionError):
merge_dicts([a, d])
class TestDtypes(unittest.TestCase):
def test_dtypes_fields(self):
fields = dtypes.fields()
self.assertTrue(all(isinstance(value, DType) for value in fields.values()))
self.assertTrue(all(issubclass(value.np, np.generic) for value in fields.values() if value.np is not None))
class TestStripParens(unittest.TestCase):
def test_simple(self): self.assertEqual("1+2", strip_parens("(1+2)"))
def test_nested(self): self.assertEqual("1+(2+3)", strip_parens("(1+(2+3))"))
def test_casted_no_strip(self): self.assertEqual("(int)(1+2)", strip_parens("(int)(1+2)"))
class TestProd(unittest.TestCase):
def test_empty(self): self.assertEqual(1, prod(tuple()))
def test_ints(self): self.assertEqual(30, prod((2, 3, 5)))
def test_variable(self): self.assertEqual("(a*12)", prod((Variable("a", 1, 5), 3, 4)).render())
def test_variable_order(self): self.assertEqual("(a*12)", prod((3, 4, Variable("a", 1, 5))).render())
def test_num_nodes(self): self.assertEqual(NumNode(6), prod((NumNode(2), NumNode(3))))
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,663 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.helpers import prod, DEBUG
from tinygrad.shape.shapetracker import ShapeTracker, View, get_contraction
from tinygrad.shape.symbolic import Variable
from itertools import product
def shapetracker_getitem(st, val):
locals = {"idx": val, "valid": 1}
idx, valid = st.expr_node()
exec(f"valid={valid.render()};idx={idx.render()}", None, locals)
return locals["idx"] if locals["valid"] else -1
class CheckingShapeTracker:
def __init__(self, shape):
self.st = ShapeTracker.from_shape(shape)
self.t = np.arange(prod(shape), dtype=np.int32).reshape(shape)
@property
def shape(self):
return self.t.shape
def simplify(self):
self.st = self.st.simplify()
return self
def reshape(self, new_shape):
self.st = self.st.reshape(new_shape)
self.t = self.t.reshape(new_shape)
return self
def permute(self, axis):
self.st = self.st.permute(axis)
self.t = np.transpose(self.t, axis)
return self
def expand(self, new_shape):
self.st = self.st.expand(new_shape)
self.t = np.broadcast_to(self.t, new_shape)
return self
def flip(self, axis):
self.st = self.st.stride(tuple(-1 if i in axis else 1 for i in range(len(self.shape))))
self.t = np.flip(self.t, axis)
return self
def shrink(self, arg):
self.st = self.st.shrink(arg)
self.t = self.t[tuple([slice(x[0], x[1]) for x in arg])]
return self
def pad(self, arg):
self.st = self.st.pad(arg)
self.t = np.pad(self.t, arg, constant_values=-1)
return self
def stride(self, arg):
self.st = self.st.stride(arg)
self.t = self.t[tuple([slice(None, None, x) for x in arg])]
return self
def __getitem__(self, val):
return self.t.flatten()[val]
@property
def views(self): return self.st.views
@property
def contiguous(self): return self.st.contiguous
def assert_same(self):
x = [shapetracker_getitem(self.st, i) for i in range(prod(self.st.shape))]
y = [self[i] for i in range(prod(self.shape))]
idx, valid = self.st.expr_node()
if DEBUG >= 1: print(x, y, self.st.shape, self.shape, idx.render(), valid.render(), self.st)
assert self.st.shape == self.shape
assert x == y, f"mismatch shapetracker:{x} real:{y}"
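  # the harness applies each movement op to both the ShapeTracker and a real
  # numpy array; assert_same then walks every flat index through both, so any
  # stride/offset/mask bug surfaces as a mismatch between the two address maps.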
class TestRealIssues(unittest.TestCase):
def test_reshape_doesnt_multiview(self):
self.st = ShapeTracker((View.create((256, 256, 2, 2, 2, 2, 2, 256, 8, 2), (0, 8, 0, 4, 0, 0, 2, 16384, 2048, 1), 0, None),))
self.st.reshape((128, 2, 256, 2, 2, 2, 2, 2, 256, 8, 2))
assert len(self.st.views) == 1
class TestRealDoesntSimplify(unittest.TestCase):
def tearDown(self):
st = self.st.real_strides()
print(st)
self.st = self.st.simplify()
assert len(self.st.views) != 1
assert None in st
def test_1(self):
self.st = ShapeTracker((
View.create((8, 3, 1, 2, 11, 1), (33, 11, 0, 0, 1, 0), 0, None),
View.create((8, 6, 11), (66, 11, 1), 0, None)))
assert self.st.real_strides() == (33, None, 1)
def test_2(self):
self.st = ShapeTracker((
View.create((2, 2, 4, 3, 3), (72, 9, 18, -3, -1), 8, None),
View.create((4, 4, 3, 3), (36, 9, 3, 1), 0, None)))
assert self.st.real_strides() == (None, 18, -3, -1)
class TestRealStrides(unittest.TestCase):
def test_1(self):
self.st = ShapeTracker((
View.create((2048,), (1,), 0, ((0, 512),)),
View.create((16, 32, 4), (128, 4, 1), 0, None)))
st = self.st.real_strides()
print(self.st, st)
assert st == (None, 4, 1)
class TestRealSimplifies(unittest.TestCase):
def tearDown(self):
st = self.st.real_strides()
self.st = self.st.simplify()
assert len(self.st.views) == 1
print(self.st.views[-1].strides, st)
assert self.st.views[-1].strides == st
def test_1(self):
self.st = ShapeTracker((
View.create((1, 3, 2, 11, 4, 28), (0, 308, 0, 28, 0, 1), 0, None),
View.create((1, 3, 2, 11, 26, 1, 1, 3), (0, 2464, 0, 112, 1, 0, 0, 29), 0, None)))
def test_2(self):
self.st = ShapeTracker((
View.create((8, 3, 3, 11, 2, 28), (924, 308, 0, 28, 0, 1), 0, None),
View.create((8, 1, 6, 10, 28, 3, 2, 1), (5544, 0, 0, 56, 1, 1848, 672, 0), 0, None)))
class TestIndexExpressions2d(unittest.TestCase):
def setUp(self):
shapes = [(30, 5), (15, 10), (15, 1), (5, 10), (5, 1)] # Make sure dim0 is a multiple of 5, one of the tests divides this dimension by 5
offsets = [0, 1, 15, 28, 10000]
self.sts = [ShapeTracker((View.create(base_shape, offset=offset),)) for base_shape in shapes for offset in offsets]
self.offset = [Variable.num(offset) for base_shape in shapes for offset in offsets]
self.shapes = [shape for shape in shapes for offset in offsets]
self.node_exprs = []
self.idxs_exprs = []
def tearDown(self):
for st, offset, shape, node_expr, idxs_expr in zip(self.sts, self.offset, self.shapes, self.node_exprs, self.idxs_exprs):
numel = prod(shape)
assert node_expr(self.default_idx(st.shape)) == st.expr_node()[0]
assert node_expr(self.default_idx(st.shape)) == st.expr_node(None)[0]
assert node_expr(self.default_idx(st.shape)) == st.expr_node('idx')[0]
self.check_bounds(node_expr(self.default_idx(st.shape)), offset, numel)
for idx in [(0, numel-1), (7, 203), (2, 5), (0, 0), (numel, numel), (0, numel), (0, numel+1), (numel+100, numel+100)]:
idx = Variable("idx", idx[0], idx[1])
assert node_expr(idx) == st.expr_node(idx)[0]
self.check_bounds(node_expr(idx), offset, numel)
assert idxs_expr(self.default_idxs(st.shape)) == st.expr_idxs()[0]
assert idxs_expr(self.default_idxs(st.shape)) == st.expr_idxs(None)[0]
self.check_bounds(idxs_expr(self.default_idxs(st.shape)), offset, numel)
idx0s = [(0,0), (0, min(1, st.shape[0]-1)), (0, st.shape[0]-1), (min(3, st.shape[0]-1), min(6, st.shape[0]-1)), (st.shape[0]-1, st.shape[0]-1)]
idx1s = [(0,0), (0, min(1, st.shape[1]-1)), (0, st.shape[1]-1), (min(3, st.shape[1]-1), min(6, st.shape[1]-1)), (st.shape[1]-1, st.shape[1]-1)]
idx2s = [(0,0), (0, min(1, st.shape[2]-1)), (0, st.shape[2]-1), (min(3, st.shape[2]-1), min(6, st.shape[2]-1)), (st.shape[2]-1, st.shape[2]-1)] if len(st.shape) == 3 else [None for _ in idx0s]
for idx0, idx1, idx2 in product(idx0s, idx1s, idx2s):
idxs = [Variable(f"idx{i}", idx[0], idx[1]) for i, idx in enumerate((idx0, idx1, idx2)) if idx is not None]
assert idxs_expr(idxs) == st.expr_idxs(idxs)[0]
self.check_bounds(idxs_expr(idxs), offset, numel)
def default_idx(self, shape):
return Variable("idx", 0, prod(shape)-1)
def default_idxs(self, shape):
return [Variable(f"idx{i}", 0, d-1) for i,d in enumerate(shape)]
def check_bounds(self, expr, offset, numel):
assert expr.min >= offset
assert expr.max <= offset + numel - 1
def test_noop(self):
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape) + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[1] + offset)
def test_permute(self):
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.permute((1, 0))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%base_shape[0]*base_shape[1] + idx//base_shape[0]%base_shape[1] + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0] + idxs[1]*base_shape[1] + offset)
new_st.append(st)
self.sts = new_st
def test_reshape(self):
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.reshape((base_shape[0], 1, base_shape[1]))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape) + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[2] + offset)
new_st.append(st)
self.sts = new_st
def test_reshape_expand(self):
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.reshape((base_shape[0], 1, base_shape[1]))
st = st.expand((base_shape[0], base_shape[1], base_shape[1]))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx//(base_shape[1]*base_shape[1])%base_shape[0]*base_shape[1] + idx%base_shape[1] + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[2] + offset)
new_st.append(st)
self.sts = new_st
def test_permute_reshape_1(self): # This tests multiple views
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.permute((1, 0))
st = st.reshape((base_shape[0]//5, 1, base_shape[1]*5))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape)%base_shape[0]*base_shape[1] + idx//base_shape[0]%base_shape[1] + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: (idxs[0]*(base_shape[1]*5)+idxs[2])%base_shape[0]*base_shape[1] + (idxs[0]*(base_shape[1]*5)+idxs[2])//base_shape[0] + offset)
new_st.append(st)
self.sts = new_st
def test_permute_reshape_2(self):
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.permute((1, 0))
st = st.reshape((1, base_shape[0]//5, base_shape[1]*5))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape)%base_shape[0]*base_shape[1] + idx//base_shape[0]%base_shape[1] + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: (idxs[1]*(base_shape[1]*5)+idxs[2])%base_shape[0]*base_shape[1] + (idxs[1]*(base_shape[1]*5)+idxs[2])//base_shape[0] + offset)
new_st.append(st)
self.sts = new_st
class TestSimplifyingShapeTracker(unittest.TestCase):
def setUp(self):
self.st = CheckingShapeTracker((1, 10))
def tearDown(self):
self.st.assert_same()
# multiview simplify
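# a rough sketch of what these tests exercise: expand followed by an
# incompatible reshape forces the ShapeTracker to stack a second view;
# simplify() merges the views back into one whenever the composed index
# mapping is expressible by a single strided view.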
def test_expand_contract_simple(self):
self.st = self.st.expand((10, 10))
self.st = self.st.reshape((100,))
print(self.st.views)
assert len(self.st.views) == 2
self.st = self.st.reshape((10, 10))
print(self.st.views)
self.st = self.st.simplify()
print(self.st.views)
assert len(self.st.views) == 1
# multiview simplify
def test_expand_contract_different_shape(self):
self.st.expand((10, 10))
self.st.reshape((100,))
print(self.st.views)
assert len(self.st.views) == 2
self.st.reshape((2, 5, 2, 5))
print(self.st.views)
self.st = self.st.simplify()
print(self.st.views)
assert len(self.st.views) == 1
# multiview simplify
def test_expand_contract_still_complex(self):
self.st.expand((10, 10))
self.st.reshape((100,))
print(self.st.views)
assert len(self.st.views) == 2
self.st.reshape((5, 20))
self.st = self.st.simplify()
print(self.st.views)
assert len(self.st.views) == 2
# Tensor.zeros(2, 4).permute(1,0).reshape(2, 4)
# (d1*4 + d0%4), d1=x//4, d0=x%4 = ((x//4)*4) + (x%4)%4
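# a hedged concrete reading of the above: with arange(8) viewed as (2, 4),
# memory holds [0..7]; permute(1,0).reshape(2, 4) yields the flat output
# [0, 4, 1, 5, 2, 6, 3, 7], i.e. output index x reads memory (x%2)*4 + x//2,
# which is not affine in x, so no single strided view can express it.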
class TestComplexShapeTracker(unittest.TestCase):
def test_add_1s(self):
self.st = CheckingShapeTracker((4, 4))
self.st.permute((1,0))
self.st.reshape((1,4,1,4,1))
assert not self.st.contiguous
self.st.permute((0,3,2,1,4))
assert self.st.contiguous
def test_permute_1s_simple(self):
self.st = CheckingShapeTracker((1, 16, 9,9))
self.st.permute((1,0,2,3))
assert self.st.contiguous
self.st = CheckingShapeTracker((2, 16, 9,9))
self.st.permute((1,0,2,3))
assert not self.st.contiguous
def test_remove_1s_simple(self):
self.st = CheckingShapeTracker((1, 16, 1, 1))
self.st.reshape((16,))
assert self.st.contiguous
def test_remove_1s(self):
self.st = CheckingShapeTracker((1, 4, 1, 4, 1))
self.st.permute((0,3,2,1,4))
self.st.reshape((4,4))
assert not self.st.contiguous
self.st.permute((1,0))
assert self.st.contiguous
def test_permute_reshape(self):
self.st = CheckingShapeTracker((4, 4))
self.st.permute((1,0))
self.st.reshape((2, 2, 2, 2))
# TODO: should also be tested by test_super_complex
assert len(self.st.views) == 1
def test_factorize_split(self):
self.st = CheckingShapeTracker((4, 4))
self.st.permute((1,0))
self.st.reshape((2, 2, 2, 2))
self.st.permute((2,3,0,1))
assert self.st.contiguous
def test_factorize_combine(self):
self.st = CheckingShapeTracker((4, 4, 4))
self.st.permute((2, 0, 1))
self.st.reshape((4, 16))
self.st.permute((1, 0))
assert self.st.contiguous
def test_factorize_combine_add_ones(self):
self.st = CheckingShapeTracker((4, 4, 4))
self.st.permute((2, 0, 1))
self.st.reshape((4, 16, 1, 1))
self.st.permute((1, 0, 2, 3))
assert self.st.contiguous
def test_fancy_factorize(self):
self.st = CheckingShapeTracker((32, 3, 3, 1))
self.st.reshape((8, 4, 3, 3))
assert len(self.st.views) == 1
def test_super_complex_2_fail(self):
self.st = CheckingShapeTracker((4, 4, 4))
self.st.permute((2, 0, 1))
self.st.reshape((16, 4))
assert len(self.st.views) != 1
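# why this needs two views: after permute the strides are (1, 16, 4), and
# flattening the first two axes walks memory as 0, 16, 32, 48, 1, 17, ...
# which is not an affine function of the merged index.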
def test_work(self):
self.st = CheckingShapeTracker((64, 1024, 4))
self.st.reshape((1, 64, 128, 32))
self.st.permute((0, 3, 1, 2))
self.st.reshape((1, 32, 1, 64, 128))
self.st.permute((0, 3, 4, 1, 2))
assert self.st.contiguous
def test_work2(self):
self.st = CheckingShapeTracker((64, 1024, 4))
self.st.reshape((1, 64, 128, 32))
self.st.permute((0, 3, 1, 2))
self.st.reshape((1, 1, 32, 64, 128))
self.st.permute((0, 3, 4, 1, 2))
self.st.reshape((64, 1024, 4))
print(self.st.views)
assert self.st.contiguous
class TestSingleShapeTracker(unittest.TestCase):
def setUp(self):
self.st = CheckingShapeTracker((7,4))
def tearDown(self):
self.st.assert_same()
def test_reshape(self):
self.st.reshape((7,1,4))
assert self.st.contiguous
def test_permute(self):
self.st.permute((1,0))
assert not self.st.contiguous
def test_shrink(self):
self.st.shrink(((1,2), (0,4)))
assert not self.st.contiguous
def test_double_permute(self):
self.st.permute((1,0))
self.st.permute((1,0))
assert self.st.contiguous
def test_reshape_permute(self):
self.st.reshape((7,1,4))
self.st.permute((0,1,2))
assert self.st.contiguous
def test_reshape_permute_yes(self):
self.st.reshape((7,1,4))
self.st.permute((0,2,1))
assert self.st.contiguous
def test_reshape_permute_no(self):
self.st.reshape((4,7))
self.st.permute((1,0))
assert not self.st.contiguous
class TestShapeTrackerFuzzFailures(unittest.TestCase):
def setUp(self):
self.st = CheckingShapeTracker((3,3,3))
def tearDown(self):
self.st.assert_same()
@unittest.skip("simplify doesn't work in this case")
def test_case_1(self):
self.st.shrink(((1, 2), (1, 3), (1, 3)))
self.st.reshape((1, 4))
self.st.shrink(((0, 1), (1, 3)))
print(self.st.st)
self.st = self.st.simplify()
print(self.st.st)
def test_case_2(self):
self.st.stride((1, 1, -2))
self.st.reshape((3, 6))
self.st.shrink(((1, 2), (1, 5)))
self.st.stride((1, -1))
def test_case_3(self):
self.st.shrink(((0, 2), (0, 2), (0, 1)))
self.st.permute((1, 0, 2))
self.st.reshape((4,))
self.st.shrink(((0, 3),))
self.st.stride((-1,))
def test_case_4(self):
self.st.reshape((3, 3, 3, 1))
self.st.pad(((0, 0), (0, 0), (0, 0), (1, 1)))
self.st.shrink(((0, 2), (1, 2), (0, 2), (0, 1)))
self.st.expand((2, 1, 2, 3))
class TestMaskedShapeTracker(unittest.TestCase):
def test_pad_1x1(self):
self.st = CheckingShapeTracker((1,1))
self.st.pad(((1,1), (1,1)))
self.st.assert_same()
def test_pad_2x2(self):
self.st = CheckingShapeTracker((2,2))
self.st.pad(((1,1), (1,1)))
self.st.assert_same()
class TestShapeTracker(unittest.TestCase):
def setUp(self):
self.st = CheckingShapeTracker((7,4))
self.apply = lambda fxn: [fxn(x) for x in [self.st]]
def tearDown(self):
self.st.assert_same()
def test_noop(self):
pass
def test_simple_split(self):
self.test_permute()
self.apply(lambda x: x.reshape((prod(self.st.shape), )))
def test_simple_pad(self):
self.st.pad(((1,1), (1,1)))
def test_pad_shrink(self):
self.st.pad(((1,1), (1,1)))
self.st.shrink(((0,4), (0,4)))
def test_pad_one_sided(self):
self.st.pad(((0,1), (0,0)))
def test_pad_reshape(self):
self.st.pad(((0,1), (0,0)))
self.st.reshape((8*4,))
def test_pad_pad(self):
self.st.pad(((1,1), (1,1)))
self.st.pad(((1,1), (1,1)))
def test_pad_permute(self):
self.st.pad(((1,1), (2,2)))
self.st.permute((1,0))
def test_pad_expand(self):
self.st.reshape((7,4,1))
self.st.pad(((1,1), (1,1), (0,0)))
self.st.expand((9,6,4))
def test_pad_expand_alt(self):
self.st.pad(((1,1), (1,1)))
self.st.reshape((9,6,1))
self.st.expand((9,6,4))
def test_pad_stride(self):
self.st.pad(((1,4), (1,3)))
self.st.stride((2,2))
def test_pad_stride_neg(self):
self.st.pad(((1,2), (1,0)))
self.st.stride((-1,-1))
def test_pad_stride_both(self):
self.st.pad(((1,2), (1,0)))
self.st.stride((-2,-2))
def test_shrink_pad(self):
self.st.shrink(((0,4), (0,4)))
self.st.pad(((1,1), (1,1)))
def test_reshape(self):
new_shape = self.st.shape[::-1]
self.apply(lambda x: x.reshape(new_shape))
def test_permute(self):
if len(self.st.shape) == 2: self.apply(lambda x: x.permute((1,0)))
elif len(self.st.shape) == 3: self.apply(lambda x: x.permute((2,0,1)))
def test_reshape_with_1(self):
new_shape = (self.st.shape[0], 1, self.st.shape[1])
self.apply(lambda x: x.reshape(new_shape))
def test_expand(self):
self.test_reshape_with_1()
new_shape = list(self.st.shape)
new_shape[1] = 2
self.apply(lambda x: x.expand(tuple(new_shape)))
def test_flip_0(self):
self.apply(lambda x: x.flip((0,)))
def test_flip_1(self):
self.apply(lambda x: x.flip((1,)))
def test_flip_01(self):
self.apply(lambda x: x.flip((0,1)))
def test_slice_0(self):
self.apply(lambda x: x.shrink(((1, x.shape[0]), (0, x.shape[1]))))
def test_slice_1(self):
self.apply(lambda x: x.shrink(((0, x.shape[0]), (1, x.shape[1]))))
def test_slice_1c1(self):
self.apply(lambda x: x.shrink(((0, 1), (0, 1))))
def test_slice_1c2(self):
self.apply(lambda x: x.shrink(((1, 2), (1, 2))))
def test_double_permute(self):
self.apply(lambda x: x.permute((1, 0)))
self.apply(lambda x: x.permute((1, 0)))
def test_slice_permute(self):
self.apply(lambda x: x.shrink(((0, 2), (2, 4))))
self.apply(lambda x: x.permute((1, 0)))
def test_slice_expand(self):
self.apply(lambda x: x.shrink(((0, 2), (3, 4))))
self.apply(lambda x: x.expand((2, 10)))
def test_double_stride(self):
self.apply(lambda x: x.stride((1, 2)))
self.apply(lambda x: x.stride((2, 1)))
def test_stride(self): self.apply(lambda x: x.stride((2,1)))
def test_stride_int(self): self.apply(lambda x: x.stride((1,2)))
def test_stride_2(self): self.apply(lambda x: x.stride((2,2)))
def test_stride_n(self): self.apply(lambda x: x.stride((-2,1)))
def test_stride_int_n(self): self.apply(lambda x: x.stride((-1,2)))
def test_stride_2_n(self): self.apply(lambda x: x.stride((-2,-2)))
def test_reshape_then_permute(self):
self.test_reshape()
self.test_permute()
def test_reshape_then_expand(self):
self.test_reshape()
self.test_expand()
def test_permute_then_reshape(self):
self.test_permute()
self.test_reshape()
def test_expand_then_reshape(self):
self.test_expand()
self.test_reshape()
def test_combo(self):
self.test_permute()
self.test_reshape()
self.test_slice_1()
self.test_expand()
self.test_permute()
class TestGetContraction(unittest.TestCase):
def test_contraction(self):
r = get_contraction((1,2,3,4), (2,3,4))
self.assertEqual(r, [[0, 1], [2], [3]])
r = get_contraction((2,1,3,4), (2,3,4))
self.assertEqual(r, [[0], [1, 2], [3]])
r = get_contraction((1,2,3,1,4), (1,2,3,4))
self.assertEqual(r, [[0], [1], [2], [3, 4]])
r = get_contraction((1,2,3,1,4,1,1), (2,3,4))
self.assertEqual(r, [[0, 1], [2], [3, 4, 5, 6]])
r = get_contraction((1,2,3,4), (1,2,3*4))
self.assertEqual(r, [[0], [1], [2, 3]])
r = get_contraction((1,2,3,4), (2,1,3,4))
self.assertEqual(r, [[0, 1], [], [2], [3]])
r = get_contraction((1,2,3,4), (1,1,2*3*4,1))
self.assertEqual(r, [[0], [], [1,2,3], []])
r = get_contraction((2,1,3,4), (1,2,3,4))
self.assertEqual(r, [[], [0], [1, 2], [3]])
r = get_contraction((1,2,3,4), (2*3*4,1,1,1))
self.assertEqual(r, [[0, 1, 2, 3], [], [], []])
r = get_contraction((4,4,4,4), (16,1,16))
self.assertEqual(r, [[0, 1], [], [2, 3]])
r = get_contraction((1,2,3,4,1,1,1), (2,3,4))
self.assertEqual(r, [[0, 1], [2], [3, 4, 5, 6]])
r = get_contraction((1,2,3,4), (1,2,3,4,1))
self.assertEqual(r, [[0], [1], [2], [3], []])
r = get_contraction((14,1,384,14,1,1,1,1), (1,14,384,14))
self.assertEqual(r, [[], [0], [1,2], [3,4,5,6,7]])
r = get_contraction((14,1,384,1,14,1,1,1,1), (1,14,384,14))
self.assertEqual(r, [[], [0], [1,2], [3,4,5,6,7,8]])
r = get_contraction((512, 512), (1, 1, 512, 1, 1, 1, 1, 512))
self.assertEqual(r, [[], [], [0], [], [], [], [], [1]])
r = get_contraction((1,2,3,4), (1,2,6,2))
self.assertEqual(r, None)
def test_contraction_ones(self):
r = get_contraction((1,), (1,1,1))
self.assertEqual(r, [[0], [], []])
r = get_contraction((1,1), (1,1,1))
self.assertEqual(r, [[0], [1], []])
r = get_contraction((1,1,1,1), (1,))
self.assertEqual(r, [[0,1,2,3]])
r = get_contraction((1,1,1,1), (1,1))
self.assertEqual(r, [[0], [1,2,3]])
r = get_contraction((1,1,1,1), (1,1,1))
self.assertEqual(r, [[0], [1], [2,3]])
r = get_contraction((1,1,1,1), (1,1,1,1))
self.assertEqual(r, [[0], [1], [2], [3]])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,39 @@
import unittest
import multiprocessing.shared_memory as shared_memory
from tinygrad.helpers import CI
from tinygrad.runtime.ops_shm import RawShmBuffer
from tinygrad.tensor import Tensor, Device
import numpy as np
class TestRawShmBuffer(unittest.TestCase):
def test_e2e(self):
t = Tensor.randn(2, 2, 2).realize()
# copy to shm
shm_name = (s := shared_memory.SharedMemory(create=True, size=t.nbytes())).name
s.close()
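# note: close() only releases this process's handle; the named segment
# persists until unlink() at the end, so the "shm:" device can attach by name.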
t_shm = t.to(f"shm:{shm_name}").realize()
# copy from shm
t2 = t_shm.to(Device.DEFAULT).realize()
assert np.allclose(t.numpy(), t2.numpy())
s.unlink()
@unittest.skipIf(CI, "CI doesn't like big shared memory")
def test_e2e_big(self):
t = Tensor.randn(2048, 2048, 8).realize()
# copy to shm
shm_name = (s := shared_memory.SharedMemory(create=True, size=t.nbytes())).name
s.close()
t_shm = t.to(f"shm:{shm_name}").realize()
# copy from shm
t2 = t_shm.to(Device.DEFAULT).realize()
assert np.allclose(t.numpy(), t2.numpy())
s.unlink()
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,448 @@
#!/usr/bin/env python
import unittest
from tinygrad.shape.symbolic import Node, MulNode, SumNode, Variable, NumNode, LtNode, sym_render, sym_infer, create_rednode
class TestSymbolic(unittest.TestCase):
def helper_test_variable(self, v, n, m, s):
self.assertEqual(v.render(), s)
self.assertEqual(v.min, n)
self.assertEqual(v.max, m)
def test_ge(self):
self.helper_test_variable(Variable("a", 3, 8)>=77, 0, 0, "0")
self.helper_test_variable(Variable("a", 3, 8)>=9, 0, 0, "0")
self.helper_test_variable(Variable("a", 3, 8)>=8, 0, 1, "((a*-1)<-7)")
self.helper_test_variable(Variable("a", 3, 8)>=4, 0, 1, "((a*-1)<-3)")
self.helper_test_variable(Variable("a", 3, 8)>=3, 1, 1, "1")
self.helper_test_variable(Variable("a", 3, 8)>=2, 1, 1, "1")
def test_lt(self):
self.helper_test_variable(Variable("a", 3, 8)<77, 1, 1, "1")
self.helper_test_variable(Variable("a", 3, 8)<9, 1, 1, "1")
self.helper_test_variable(Variable("a", 3, 8)<8, 0, 1, "(a<8)")
self.helper_test_variable(Variable("a", 3, 8)<4, 0, 1, "(a<4)")
self.helper_test_variable(Variable("a", 3, 8)<3, 0, 0, "0")
self.helper_test_variable(Variable("a", 3, 8)<2, 0, 0, "0")
def test_ge_divides(self):
expr = (Variable("idx", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512
self.helper_test_variable(expr, 0, 1, "(idx<128)")
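# why this folds: FLOAT4_INDEX is at most 3, so idx*4 + FLOAT4_INDEX < 512
# holds exactly when idx*4 <= 508, i.e. idx < 128.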
def test_ge_divides_and(self):
expr = Variable.ands([(Variable("idx1", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512,
(Variable("idx2", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512])
self.helper_test_variable(expr, 0, 1, "((idx1<128) and (idx2<128))")
expr = Variable.ands([(Variable("idx1", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512,
(Variable("idx2", 0, 511)*4 + Variable("FLOAT8_INDEX", 0, 7)) < 512])
self.helper_test_variable(expr//4, 0, 1, "((((FLOAT8_INDEX//4)+idx2)<128) and ((idx1//4)<32))")
def test_lt_factors(self):
expr = Variable.ands([(Variable("idx1", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 256)) < 512])
self.helper_test_variable(expr, 0, 1, "(((idx1*4)+FLOAT4_INDEX)<512)")
def test_div_becomes_num(self):
assert isinstance(Variable("a", 2, 3)//2, NumNode)
def test_var_becomes_num(self):
assert isinstance(Variable("a", 2, 2), NumNode)
def test_equality(self):
idx1 = Variable("idx1", 0, 3)
idx2 = Variable("idx2", 0, 3)
assert idx1 == idx1
assert idx1 != idx2
assert idx1*4 == idx1*4
assert idx1*4 != idx1*3
assert idx1*4 != idx1+4
assert idx1*4 != idx2*4
assert idx1+idx2 == idx1+idx2
assert idx1+idx2 == idx2+idx1
assert idx1+idx2 != idx2
def test_factorize(self):
a = Variable("a", 0, 8)
self.helper_test_variable(a*2+a*3, 0, 8*5, "(a*5)")
def test_factorize_no_mul(self):
a = Variable("a", 0, 8)
self.helper_test_variable(a+a*3, 0, 8*4, "(a*4)")
def test_neg(self):
self.helper_test_variable(-Variable("a", 0, 8), -8, 0, "(a*-1)")
def test_add_1(self):
self.helper_test_variable(Variable("a", 0, 8)+1, 1, 9, "(1+a)")
def test_add_num_1(self):
self.helper_test_variable(Variable("a", 0, 8)+Variable.num(1), 1, 9, "(1+a)")
def test_sub_1(self):
self.helper_test_variable(Variable("a", 0, 8)-1, -1, 7, "(-1+a)")
def test_sub_num_1(self):
self.helper_test_variable(Variable("a", 0, 8)-Variable.num(1), -1, 7, "(-1+a)")
def test_mul_0(self):
self.helper_test_variable(Variable("a", 0, 8)*0, 0, 0, "0")
def test_mul_1(self):
self.helper_test_variable(Variable("a", 0, 8)*1, 0, 8, "a")
def test_mul_neg_1(self):
self.helper_test_variable((Variable("a", 0, 2)*-1)//3, -1, 0, "((((a*-1)+3)//3)+-1)")
def test_mul_2(self):
self.helper_test_variable(Variable("a", 0, 8)*2, 0, 16, "(a*2)")
def test_div_1(self):
self.helper_test_variable(Variable("a", 0, 8)//1, 0, 8, "a")
def test_mod_1(self):
self.helper_test_variable(Variable("a", 0, 8)%1, 0, 0, "0")
def test_add_min_max(self):
self.helper_test_variable(Variable("a", 0, 8) * 2 + 12, 12, 16+12, "((a*2)+12)")
def test_div_min_max(self):
self.helper_test_variable(Variable("a", 0, 7) // 2, 0, 3, "(a//2)")
def test_div_neg_min_max(self):
self.helper_test_variable(Variable("a", 0, 7) // -2, -3, 0, "((a//2)*-1)")
def test_sum_div_min_max(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7), Variable("b", 0, 3)]) // 2, 0, 5, "((a+b)//2)")
def test_sum_div_factor(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*4, Variable("b", 0, 3)*4]) // 2, 0, 20, "((a*2)+(b*2))")
def test_sum_div_some_factor(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*5, Variable("b", 0, 3)*4]) // 2, 0, 23, "(((a*5)//2)+(b*2))")
def test_sum_div_some_partial_factor(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*6, Variable("b", 0, 7)*6]) // 16, 0, 5, "(((a*3)+(b*3))//8)")
self.helper_test_variable(Variable.sum([Variable.num(16), Variable("a", 0, 7)*6, Variable("b", 0, 7)*6]) // 16, 1, 6, "((((a*3)+(b*3))//8)+1)")
def test_sum_div_no_factor(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*5, Variable("b", 0, 3)*5]) // 2, 0, 25, "(((a*5)+(b*5))//2)")
def test_mod_factor(self):
# NOTE: even though the mod max is 50, it can't know this without knowing about the mul
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*100, Variable("b", 0, 3)*50]) % 100, 0, 99, "((b*50)%100)")
def test_mod_to_sub(self):
# This is mod reduction
self.helper_test_variable((1+Variable("a",1,2))%2, 0, 1, (Variable("a",1,2)-1).render())
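# sanity check: a is 1 or 2, so (1+a) is 2 or 3 and (1+a)%2 is 0 or 1,
# which is exactly a-1.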
def test_sum_div_const(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*4, Variable.num(3)]) // 4, 0, 7, "a")
def test_sum_div_const_big(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*4, Variable.num(3)]) // 16, 0, 1, "(a//4)")
def test_sum_lt_fold(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7) * 4, Variable("b", 0, 3)]) < 16, 0, 1, "(a<4)")
self.helper_test_variable(Variable.sum([Variable("a", 0, 7) * 4, Variable("b", 0, 4)]) < 16, 0, 1, "(((a*4)+b)<16)")
def test_mod_mul(self):
self.helper_test_variable((Variable("a", 0, 5)*10)%9, 0, 5, "a")
def test_mod_mod(self):
self.helper_test_variable((Variable("a", 0, 31)%12)%4, 0, 3, "(a%4)")
self.helper_test_variable(((4*Variable("a", 0, 31)) % 12) % 4, 0, 0, "0")
self.helper_test_variable((Variable("a", 0, 31) % 4) % 12, 0, 3, "(a%4)")
def test_mul_mul(self):
self.helper_test_variable((Variable("a", 0, 5)*10)*9, 0, 5*10*9, "(a*90)")
def test_mul_lt(self):
self.helper_test_variable((Variable("a", 0, 5)*4)<13, 0, 1, "(a<4)")
self.helper_test_variable((Variable("a", 0, 5)*4)<16, 0, 1, "(a<4)")
self.helper_test_variable((Variable("a", 0, 5)*4)>11, 0, 1, "((a*-1)<-2)")
self.helper_test_variable((Variable("a", 0, 5)*4)>12, 0, 1, "((a*-1)<-3)")
def test_div_div(self):
self.helper_test_variable((Variable("a", 0, 1800)//10)//9, 0, 20, "(a//90)")
def test_distribute_mul(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 3), Variable("b", 0, 5)])*3, 0, 24, "((a*3)+(b*3))")
def test_mod_mul_sum(self):
self.helper_test_variable(Variable.sum([Variable("b", 0, 2), Variable("a", 0, 5)*10])%9, 0, 7, "(a+b)")
def test_sum_0(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)]), 0, 7, "a")
def test_mod_remove(self):
self.helper_test_variable(Variable("a", 0, 6)%100, 0, 6, "a")
def test_big_mod(self):
# NOTE: we no longer support negative variables
#self.helper_test_variable(Variable("a", -20, 20)%10, -9, 9, "(a%10)")
#self.helper_test_variable(Variable("a", -20, 0)%10, -9, 0, "(a%10)")
#self.helper_test_variable(Variable("a", -20, 1)%10, -9, 1, "(a%10)")
self.helper_test_variable(Variable("a", 0, 20)%10, 0, 9, "(a%10)")
#self.helper_test_variable(Variable("a", -1, 20)%10, -1, 9, "(a%10)")
def test_gt_remove(self):
self.helper_test_variable(Variable("a", 0, 6) >= 25, 0, 0, "0")
def test_lt_remove(self):
self.helper_test_variable(Variable("a", 0, 6) < -3, 0, 0, "0")
self.helper_test_variable(Variable("a", 0, 6) < 3, 0, 1, "(a<3)")
self.helper_test_variable(Variable("a", 0, 6) < 8, 1, 1, "1")
def test_lt_sum_remove(self):
self.helper_test_variable((Variable("a", 0, 6) + 2) < 3, 0, 1, "(a<1)")
def test_and_fold(self):
self.helper_test_variable(Variable.ands([Variable.num(0), Variable("a", 0, 1)]), 0, 0, "0")
def test_and_remove(self):
self.helper_test_variable(Variable.ands([Variable.num(1), Variable("a", 0, 1)]), 0, 1, "a")
def test_mod_factor_negative(self):
self.helper_test_variable(Variable.sum([Variable.num(-29), Variable("a", 0, 10), Variable("b", 0, 10)*28]) % 28, 0, 27, "((27+a)%28)")
self.helper_test_variable(Variable.sum([Variable.num(-29), Variable("a", 0, 100), Variable("b", 0, 10)*28]) % 28, 0, 27, "((27+a)%28)")
def test_sum_combine_num(self):
self.helper_test_variable(Variable.sum([Variable.num(29), Variable("a", 0, 10), Variable.num(-23)]), 6, 16, "(6+a)")
def test_sum_num_hoisted_and_factors_cancel_out(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 1) * -4 + 1, Variable("a", 0, 1) * 4]), 1, 1, "1")
def test_div_factor(self):
self.helper_test_variable(Variable.sum([Variable.num(-40), Variable("a", 0, 10)*2, Variable("b", 0, 10)*40]) // 40, -1, 9, "(-1+b)")
def test_mul_div(self):
self.helper_test_variable((Variable("a", 0, 10)*4)//4, 0, 10, "a")
def test_mul_div_factor_mul(self):
self.helper_test_variable((Variable("a", 0, 10)*8)//4, 0, 20, "(a*2)")
def test_mul_div_factor_div(self):
self.helper_test_variable((Variable("a", 0, 10)*4)//8, 0, 5, "(a//2)")
def test_div_remove(self):
self.helper_test_variable(Variable.sum([Variable("idx0", 0, 127)*4, Variable("idx2", 0, 3)])//4, 0, 127, "idx0")
def test_div_numerator_negative(self):
self.helper_test_variable((Variable("idx", 0, 9)*-10)//11, -9, 0, "((((idx*-10)+99)//11)+-9)")
def test_div_into_mod(self):
self.helper_test_variable((Variable("idx", 0, 16)*4)%8//4, 0, 1, "(idx%2)")
class TestSymbolicNumeric(unittest.TestCase):
def helper_test_numeric(self, f):
# TODO: why are the negative tests broken? (even if we did support negative variables)
#MIN, MAX = -10, 10
MIN, MAX = 0, 10
# one number
for i in range(MIN, MAX):
v = f(Variable.num(i))
#print(i, f(i), v.min, v.max)
self.assertEqual(v.min, v.max)
self.assertEqual(v.min, f(i))
for kmin in range(MIN, MAX):
for kmax in range(MIN, MAX):
if kmin > kmax: continue
v = f(Variable("tmp", kmin, kmax))
values = [f(rv) for rv in range(kmin, kmax+1)]
# the min and max may not be exact
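# e.g. for f(x) = (x*2 + 3) % 4 over [0, 1] the true values are {3, 1};
# a conservative symbolic bound like [0, 3] still passes, since only
# containment of the true range is checked below.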
self.assertLessEqual(v.min, min(values))
self.assertGreaterEqual(v.max, max(values))
def test_mod_4(self): self.helper_test_numeric(lambda x: (x%4))
def test_div_4(self): self.helper_test_numeric(lambda x: (x//4))
def test_plus_1_div_2(self): self.helper_test_numeric(lambda x: (x+1)//2)
def test_plus_1_mod_2(self): self.helper_test_numeric(lambda x: (x+1)%2)
def test_times_2(self): self.helper_test_numeric(lambda x: x*2)
def test_times_2_plus_3(self): self.helper_test_numeric(lambda x: x*2 + 3)
def test_times_2_plus_3_mod_4(self): self.helper_test_numeric(lambda x: (x*2 + 3)%4)
def test_times_2_plus_3_div_4(self): self.helper_test_numeric(lambda x: (x*2 + 3)//4)
def test_times_2_plus_3_div_4_mod_4(self): self.helper_test_numeric(lambda x: ((x*2 + 3)//4)%4)
class TestSymbolicVars(unittest.TestCase):
def test_simple(self):
z = NumNode(0)
a = Variable("a", 0, 10)
b = Variable("b", 0, 10)
c = Variable("c", 0, 10)
assert z.vars() == z.vars() == []
assert a.vars() == a.vars() == [a]
m = MulNode(a, 3)
assert m.vars() == [a]
s = SumNode([a, b, c])
assert s.vars() == [a, b, c]
def test_compound(self):
a = Variable("a", 0, 10)
b = Variable("b", 0, 10)
c = Variable("c", 0, 10)
assert (a + b * c).vars() == [a, b, c]
assert (a % 3 + b // 5).vars() == [a, b]
assert (a + b + c - a).vars() == [b, c]
class TestSymbolicMinMax(unittest.TestCase):
def test_min_max_known(self):
a = Variable("a", 1, 8)
assert max(1, a) == max(a, 1) == a
assert min(1, a) == min(a, 1) == 1
class TestSymRender(unittest.TestCase):
def test_sym_render(self):
a = Variable("a", 1, 8)
b = Variable("b", 1, 10)
assert sym_render(a) == "a"
assert sym_render(1) == "1"
assert sym_render(a+1) == "(1+a)"
assert sym_render(a*b) == "(a*b)"
class TestSymInfer(unittest.TestCase):
def test_sym_infer(self):
a = Variable("a", 0, 10)
b = Variable("b", 0, 10)
c = Variable("c", 0, 10)
var_vals = {a: 2, b: 3, c: 4}
assert sym_infer(5, var_vals) == 5
assert sym_infer(a, var_vals) == 2
assert sym_infer(b, var_vals) == 3
assert sym_infer(a+b, var_vals) == 5
assert sym_infer(a-b, var_vals) == -1
assert sym_infer(a+b+c, var_vals) == 9
assert sym_infer(a*b, var_vals) == 6
assert sym_infer(a*b+c, var_vals) == 10
class TestSymbolicSymbolicOps(unittest.TestCase):
def test_node_divmod_node(self):
i = Variable("i", 1, 10)
idx0 = Variable("idx0", 0, i*3-1)
assert NumNode(0) // (Variable("i", 1, 10)*128) == 0
assert NumNode(0) % (Variable("i", 1, 10)*128) == 0
assert NumNode(127) // (Variable("i", 1, 10)*128) == 0
assert NumNode(127) % (Variable("i", 1, 10)*128) == 127
assert 127 // (Variable("i", 1, 10)*128) == 0
assert 127 % (Variable("i", 1, 10)*128) == 127
assert NumNode(128) // (Variable("i", 1, 10)*128 + 128) == 0
assert NumNode(128) % (Variable("i", 1, 10)*128 + 128) == 128
assert 128 // (Variable("i", 1, 10)*128 + 128) == 0
assert 128 % (Variable("i", 1, 10)*128 + 128) == 128
assert 0 // (Variable("i", 1, 10)*128) == 0
assert 0 % (Variable("i", 1, 10)*128) == 0
assert idx0 // (i*3) == 0
assert idx0 % (i*3) == idx0
assert i // i == 1
assert i % i == 0
assert 128 // NumNode(4) == 32
assert 128 % NumNode(4) == 0
assert NumNode(128) // NumNode(4) == 32
assert NumNode(128) % NumNode(4) == 0
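# the constant folds above are pure range reasoning: with i >= 1 the
# divisor i*128 is at least 128, so any non-negative constant below it has
# quotient 0 and is its own remainder.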
def test_mulnode_divmod_node(self):
i = Variable("i", 1, 10)
idx0 = Variable("idx0", 0, 31)
assert (idx0*(i*4+4)) // (i+1) == (idx0*4)
assert (idx0*(i*4+4)) % (i+1) == 0
assert (idx0*i) % i == 0
def test_sumnode_divmod_sumnode(self):
i = Variable("i", 1, 10)
idx0 = Variable("idx0", 0, 7)
idx1 = Variable("idx1", 0, 3)
idx2 = Variable("idx2", 0, i)
assert (idx0*(i*4+4)+idx1*(i+1)+idx2) // (i+1) == idx0*4+idx1
assert (idx0*(i*4+4)+idx1*(i+1)+idx2) % (i+1) == idx2
assert (i+1) // (i*128+128) == 0
assert (i+1) % (i*128+128) == (i+1)
assert (i+1+idx2) // (i+1) == 1
assert (i+1+idx2) % (i+1) == idx2
assert (idx0*(i*4+4)+i+1+idx2) // (i+1) == idx0*4+1
assert (idx0*(i*4+4)+i+1+idx2) % (i+1) == idx2
assert (i*128+128)*2 // (i*128+128) == 2
assert (i*128+128)*2 % (i*128+128) == 0
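# term by term: idx0*(i*4+4) equals (i+1)*(idx0*4), so it moves wholly into
# the quotient, while idx2 in [0, i] is strictly below the divisor i+1 and
# stays in the remainder.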
def test_sumnode_divmod_sumnode_complex(self):
i = Variable("i", 1, 1024)
gidx0 = Variable("gidx0", 0, i)
lidx1 = Variable("lidx1", 0, 7)
ridx2 = Variable("ridx1", 0, 31)
assert ((i*128+128)*2 + gidx0*128 + lidx1*(i*512+512) + ridx2*4) // (i*128+128) == 2 + lidx1*4
assert ((i*128+128)*2 + gidx0*128 + lidx1*(i*512+512) + ridx2*4) % (i*128+128) == gidx0*128 + ridx2*4
assert ((gidx0*128+i*128+ridx2*4+129)) // (i*128+128) == 1
assert ((gidx0*128+i*128+ridx2*4+129)) % (i*128+128) == gidx0*128 + ridx2*4 + 1
assert (ridx2*(i*4+4)+1+i+gidx0) // (i*128+128) == 0
assert (ridx2*(i*4+4)+1+i+gidx0) % (i*128+128) == (ridx2*(i*4+4)+1+i+gidx0)
def test_node_lt_node(self):
a = Variable("a", 1, 5)
b = Variable("b", 6, 9)
c = Variable("c", 1, 10)
d = Variable("d", 5, 10)
# if the value is always the same, it folds to num
assert (a < b) == 1
assert (b < a) == 0
assert (d < a) == 0
# if it remains as a LtNode, bool is always true and (min, max) == (0, 1)
assert isinstance((a < c), LtNode) and (a < c).min == 0 and (a < c).max == 1
assert a < c
assert isinstance((a > c), LtNode) and (a > c).min == 0 and (a > c).max == 1
# same when comparing with a constant
assert a < 3 and (a < 3).min == 0 and (a < 3).max == 1
assert a > 3 and (a > 3).min == 0 and (a > 3).max == 1
def test_num_node_mul_node(self):
a = Variable("a", 1, 5)
b = NumNode(2) * a
assert b == a * 2
assert isinstance(b, MulNode)
b = NumNode(1) * a
assert b == a
assert isinstance(b, Variable)
b = NumNode(0) * a
assert b == 0
assert isinstance(b, NumNode)
def test_num_node_expand(self):
a = NumNode(42)
assert a.expand() == [a]
def test_variable_expand(self):
a = Variable("a", 5, 7)
assert a.expand() == [a]
def test_variable_expand_expr_none(self):
a = Variable(None, 5, 7)
assert a.expand() == [NumNode(5), NumNode(6), NumNode(7)]
def test_mul_node_expand(self):
a = Variable(None, 5, 7)
m = MulNode(a, 3)
assert m.expand() == [NumNode(15), NumNode(18), NumNode(21)]
b = Variable("b", 1, 3)
n = MulNode(b, 3)
assert n.expand() == [Variable("b", 1, 3)*3]
def test_sum_node_expand(self):
a = Variable(None, 1, 3)
b = Variable("b", 5, 7)
s1 = create_rednode(SumNode, [a, b])
assert s1.expand() == [Variable.sum([NumNode(i),b]) for i in range(1,4)]
def test_multi_expand(self):
a = Variable("a", 1, 3)
b = Variable("b", 14, 17)
s1 = create_rednode(SumNode, [a, b])
# expand increments earlier variables (in argument order) faster than later ones
# this ordering was carried over from the previous implementation; the rationale is not documented
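# e.g. with a in [1, 3] and b in [14, 17] the expansion enumerates
# (a=1,b=14), (a=2,b=14), (a=3,b=14), (a=1,b=15), ... matching the nested
# comprehension below (outer loop over b, inner over a).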
assert s1.expand((a, b)) == [NumNode(x + y) for x in range(b.min, b.max + 1) for y in range(a.min, a.max + 1)]
def test_substitute(self):
a = Variable(None, 1, 3)
b = a + 1
c = b.substitute({a: NumNode(1)})
assert c == NumNode(2)
if __name__ == '__main__':
unittest.main()