Add openpilot tests

FrogAi
2024-03-06 14:58:47 -07:00
parent 2901597132
commit b39097a12d
259 changed files with 31176 additions and 12 deletions

View File

@@ -0,0 +1,62 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()
from extra.dist import collectives
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def allreduce_jit(t:Tensor, cache_id=None) -> Tensor:
  return collectives.allreduce(t, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2
SIZE_2 = 255 if not CI else 3

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.zeros(SIZE, SIZE) if rank != 0 else Tensor.ones(SIZE, SIZE)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test")
    assert np.allclose(np.ones((SIZE, SIZE)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  # reset jit
  allreduce_jit.cnt = 0
  allreduce_jit.input_replace = {}

  # test uneven chunk sizes
  for _ in range(3):
    # create a tensor to send
    t = Tensor.ones(SIZE_2, SIZE_2, SIZE_2) if rank == 0 else Tensor.zeros(SIZE_2, SIZE_2, SIZE_2)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test2")
    assert np.allclose(np.ones((SIZE_2, SIZE_2, SIZE_2)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    from tinygrad.runtime.ops_hip import HIP
    devices = [f"hip:{i}" for i in range(HIP.device_count)]
  else:
    from tinygrad.runtime.ops_gpu import CL
    devices = [f"gpu:{i}" for i in range(len(CL.devices))] if not CI else ["gpu:0", "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
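
# Sketch of what this checks (illustrative, not part of the test): allreduce
# sums each rank's tensor and broadcasts the result to every rank, so with
# rank 0 holding ones and every other rank holding zeros the expected result
# on all ranks is simply:
#   np.ones((SIZE, SIZE)) + np.zeros((SIZE, SIZE)) == np.ones((SIZE, SIZE))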

View File

@@ -0,0 +1,68 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()
from extra.dist import world
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def send_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.send(t, target_rank, cache_id=cache_id).realize()

@TinyJit
def recv_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.recv(t, target_rank, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.randn(SIZE, SIZE)

    # send to rank 1
    if rank == 0:
      send_jit(t, 1, cache_id="test")
    elif rank == 1:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 0, cache_id="test")

    # recv from rank 1
    if rank == 0:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 1, cache_id="test2")
    elif rank == 1:
      send_jit(t2, 0, cache_id="test2")

    # check that the received tensor is the same as the sent tensor
    if rank == 0:
      assert np.allclose(t.numpy(), t2.numpy()), f"{t2.numpy()} wasn't equal to {t.numpy()}"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    devices = ["hip:0", "hip:1"]
  else:
    devices = ["gpu:0", "gpu:1" if not CI else "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
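
# Round-trip shape of the test above (illustrative): rank 0 sends t, rank 1
# echoes the received buffer straight back, and rank 0 compares the echo with
# the original, so only rank 0 ever needs to know the expected values:
#   rank 0: send(t, 1) ...... recv(t2, 1); assert t == t2
#   rank 1: recv(t2, 0) ..... send(t2, 0)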

View File

@@ -0,0 +1,27 @@
import unittest
from tinygrad.helpers import prod
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import GlobalCounters
from tinygrad.jit import CacheCollector
class TestCopy(unittest.TestCase):
  def test_add1(self):
    pts = []
    for i in range(16384, 16384*256, 16384):
      t = Tensor.randn(i).realize()
      CacheCollector.start()
      t.assign(t+1).realize()
      fxn, args, _ = CacheCollector.finish()[0]
      GlobalCounters.reset()
      def run(): return fxn(args, force_wait=True)
      ct = min([run() for _ in range(10)])
      mb = prod(t.shape)*t.dtype.itemsize*2*1e-6  # read + write traffic in MB
      print(f"{mb*1e3:.2f} kB, {ct*1e3:.2f} ms, {mb/ct:.2f} MB/s")
      pts.append((mb, mb/ct))
    from matplotlib import pyplot as plt
    plt.plot([x[0] for x in pts], [x[1] for x in pts])
    plt.show()

if __name__ == '__main__':
  unittest.main()
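
# Worked example of the bandwidth math above, assuming fxn(..., force_wait=True)
# returns elapsed seconds: assign(t+1) reads and writes every element once, so
# an N-element float32 tensor moves 2*4*N bytes. For the first size, N=16384,
# that is 2*4*16384*1e-6 = 0.13 MB, and MB/s is that divided by the runtime.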

View File

@@ -0,0 +1,102 @@
from lm_eval.base import BaseLM
from lm_eval import evaluator, tasks
import torch, json, argparse
from examples.llama import LLaMa
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
class LLaMaAdaptor(BaseLM):
  def __init__(
    self,
    model_size="7B",
    model_gen=1,
    device="",
    quantize=False,
    batch_size=1,
    max_batch_size=1,
    do_sample=False,
    temperature=1.0,
    checkpoint_path="",
    tokenizer_path="",
  ):
    super().__init__()

    if batch_size is None:
      batch_size = 1
    self.do_sample = do_sample
    self.temperature = temperature
    self._device = device

    assert isinstance(model_gen, int)
    assert isinstance(model_size, str)
    assert isinstance(batch_size, int)
    assert isinstance(checkpoint_path, str)
    assert isinstance(tokenizer_path, str)

    self.llama = LLaMa.build(checkpoint_path, tokenizer_path, model_gen, model_size, quantize)

  @classmethod
  def create_from_arg_string(cls, arg_string, additional_config=None):
    if additional_config is None: additional_config = {}
    kwargs = {el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",")}
    return cls(**kwargs, **additional_config)

  @property
  def eot_token_id(self):
    # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
    return self.llama.tokenizer.eos_id()

  @property
  def max_length(self):
    return 1024

  @property
  def max_gen_toks(self):
    return 256

  @property
  def batch_size(self):
    return 1

  @property
  def device(self):
    return self._device

  def tok_encode(self, string: str):
    return [self.llama.tokenizer.bos_id()] + self.llama.tokenizer.encode(string)

  def tok_decode(self, tokens):
    return self.llama.tokenizer.decode(tokens)

  def _model_call(self, inps):
    Tensor.no_grad = True
    return torch.Tensor(self.llama.model(Tensor(inps.numpy()), 0).numpy())

  def greedy_until(self, requests):
    continuations = []
    for request in requests:
      prompt, until = request[0], request[1]['until']
      output = self.llama.greedy_until(prompt, until, max_length=128, temperature=0.0)
      continuations.append(output[len(prompt):])
    return continuations

  def _model_generate(self, context, max_length, eos_token_id):
    raise NotImplementedError()

if __name__ == '__main__':
  print(f"using {Device.DEFAULT} backend")

  parser = argparse.ArgumentParser(description='Run LLaMA evals in tinygrad', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--size', type=str, default="7B", help="Size of model to use [7B, 13B, 30B, 65B] for Gen 1, [7B, 13B] for Gen 2")
  parser.add_argument('--gen', type=int, default=1, help="Generation of the model to use [1, 2]")
  parser.add_argument('--quantize', action='store_true', help="Quantize the weights to int8 in memory")
  parser.add_argument('--eval', type=str, default="arc_easy", help="Comma-separated list of eval tasks to run")
  parser.add_argument('--limit', type=int, default=None, help="Limit the number of examples per eval task")
  parser.add_argument('--weights', type=str, default="./weights/LLaMa/", help="Location of the weights")
  parser.add_argument('--tokenizer', type=str, default="./weights/LLaMa/tokenizer.model", help="Location of the tokenizer")
  args = parser.parse_args()

  # run eval and exit
  adaptor = LLaMaAdaptor(model_gen=args.gen, model_size=args.size, quantize=args.quantize, checkpoint_path=args.weights, tokenizer_path=args.tokenizer, device="cpu")
  results = evaluator.evaluate(adaptor, tasks.get_task_dict(args.eval.split(",")), False, 0, args.limit)
  print(json.dumps(results, indent=2))
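
# Hypothetical invocation (the script name and paths are illustrative; the
# flags match the parser above):
#   python3 llama_eval.py --gen 1 --size 7B --eval arc_easy --limit 100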

View File

@@ -0,0 +1,128 @@
import csv, pathlib, time, numpy as np
from os import getenv
import torch
torch.set_num_threads(1)
import onnx
from onnx.helper import tensor_dtype_to_np_dtype
import onnxruntime as ort
from onnx2torch import convert
from extra.utils import download_file
from extra.onnx import get_run_onnx
from tinygrad.helpers import OSX, DEBUG
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
MODELS = {
  "resnet50": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
  "openpilot": "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx",
  "efficientnet": "https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx",
  "shufflenet": "https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx",
  "commavq": "https://huggingface.co/commaai/commavq-gpt2m/resolve/main/gpt2m.onnx",

  # broken in torch MPS
  #"zfnet": "https://github.com/onnx/models/raw/main/vision/classification/zfnet-512/model/zfnet512-9.onnx",
  # TypeError: BatchNormalization() got an unexpected keyword argument 'is_test'
  #"densenet": "https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-3.onnx",
  # AssertionError: only onnx version >= 10 supported for slice
  #"bert": "https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-8.onnx",
  # really slow
  #"resnet18": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx",
}

CSV = {}
open_csv = None
torch.manual_seed(1)

def benchmark(mnm, nm, fxn):
  tms = []
  for _ in range(3):
    st = time.perf_counter_ns()
    ret = fxn()
    tms.append(time.perf_counter_ns() - st)
  print(f"{mnm:15s} {nm:25s} {min(tms)*1e-6:7.2f} ms")
  CSV[nm] = min(tms)*1e-6
  return min(tms), ret

#BASE = pathlib.Path(__file__).parents[2] / "weights" / "onnx"
BASE = pathlib.Path("/tmp/onnx")

def benchmark_model(m, validate_outs=False):
  global open_csv, CSV
  CSV = {"model": m}

  fn = BASE / MODELS[m].split("/")[-1]
  download_file(MODELS[m], fn)
  onnx_model = onnx.load(fn)
  output_names = [out.name for out in onnx_model.graph.output]
  excluded = {inp.name for inp in onnx_model.graph.initializer}
  input_shapes = {inp.name:tuple(x.dim_value if x.dim_value != 0 else 1 for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input if inp.name not in excluded}
  input_types = {inp.name: tensor_dtype_to_np_dtype(inp.type.tensor_type.elem_type) for inp in onnx_model.graph.input if inp.name not in excluded}
  #input_types = {k:v if v!=np.float16 else np.float32 for k,v in input_types.items()} # cast
  np_inputs = {k:torch.randn(shp).numpy().astype(input_types[k]) for k,shp in input_shapes.items()}
  assert len(input_shapes) < 30, f"too many input shapes {len(input_shapes)}"

  # print input names
  if DEBUG >= 2: print([inp.name for inp in onnx_model.graph.input if inp.name not in excluded])

  # bench tinygrad, with and without the jit
  for device in ["METAL" if OSX else "GPU", "CLANG"]: # + (["CUDA"] if torch.cuda.is_available() else []):
    Device.DEFAULT = device
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})

    from tinygrad.jit import TinyJit
    tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
    for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}  # warm up the jit
    benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()})
    del inputs, tinygrad_model, tinygrad_jitted_model

  # bench torch via onnx2torch
  try:
    torch_model = convert(onnx_model)
    torch_inputs = [torch.tensor(x) for x in np_inputs.values()]
    benchmark(m, "torch_cpu", lambda: torch_model(*torch_inputs))

    torch_device = "mps" if OSX else "cuda"
    torch_mps_model = torch_model.to(torch_device)
    torch_mps_inputs = [x.to(torch_device) for x in torch_inputs]
    benchmark(m, f"torch_{torch_device}", lambda: torch_mps_model(*torch_mps_inputs))
  except Exception as e: print(f"{m:16s}onnx2torch {type(e).__name__:>25}")

  # bench onnxruntime
  ort_options = ort.SessionOptions()
  ort_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
  ort_options.log_severity_level = 3 # no warnings
  for backend in ["CPU", "CUDA" if not OSX else "CoreML"]: # https://onnxruntime.ai/docs/execution-providers/
    provider = backend+"ExecutionProvider"
    if provider not in ort.get_available_providers(): continue
    ort_sess = ort.InferenceSession(str(fn), ort_options, [provider])
    benchmark(m, f"onnxruntime_{backend.lower()}", lambda: ort_sess.run(output_names, np_inputs))
    del ort_sess

  if validate_outs:
    rtol, atol = 2e-3, 2e-3 # tolerance for fp16 models
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    tinygrad_out = tinygrad_model(inputs)

    ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
    onnx_out = ort_sess.run(output_names, np_inputs)
    onnx_out = dict(zip(output_names, onnx_out))

    assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
    print(f"{m:16s}outputs validated with rtol={rtol:.1e}, atol={atol:.1e}")

  if open_csv is None:
    open_csv = csv.DictWriter(open('onnx_inference_speed.csv', 'w', newline=''), fieldnames=list(CSV.keys()))
    open_csv.writeheader()
  open_csv.writerow(CSV)

def assert_allclose(tiny_out:dict, onnx_out:dict, rtol=1e-5, atol=1e-5):
  assert len(tiny_out) == len(onnx_out) and tiny_out.keys() == onnx_out.keys()
  for k in tiny_out.keys():
    tiny_v, onnx_v = tiny_out[k], onnx_out[k]
    if tiny_v is None: assert tiny_v == onnx_v
    else: np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}")

if __name__ == "__main__":
  if getenv("MODEL", "") != "": benchmark_model(getenv("MODEL", ""), True)
  else:
    for m in MODELS: benchmark_model(m, True)
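
# Example usage (grounded in the getenv("MODEL") check above; the file name is
# illustrative): benchmark a single model with
#   MODEL=openpilot python3 benchmark_onnx.py
# or run with no MODEL set to benchmark every entry in MODELS and write
# onnx_inference_speed.csv.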

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# cd disassemblers/ && git clone --recursive git@github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored, Timing
from tinygrad.runtime.ops_gpu import CL

# TODO: support multidevice in cuda
device = 'gpu'

if __name__ == "__main__":
  sz = 1024*1024*256  # 256M float32s = 1 GB
  #sz = 1024*64

  with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
    c0 = Tensor.ones(sz, device="cpu").realize()
    c1 = (Tensor.ones(sz, device="cpu")/2).realize()

  with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a0 = c0.to(f'{device}:0').realize()
    CL.synchronize()
  with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b1 = c1.to(f'{device}:1').realize()
    CL.synchronize()

  # cross copy. this is going through the CPU
  with Timing("0 -> CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a1 = a0.to(f'{device}:1').realize()
    CL.synchronize()
  with Timing("1 -> CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b0 = b1.to(f'{device}:0').realize()
    CL.synchronize()

  # sum
  with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab0 = (a0 + b0).realize()
    CL.synchronize()
  with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab1 = (a1 + b1).realize()
    CL.synchronize()

  # cross device sum (does this work?)
  # is this making a copy first? is that copy through the CPU?
  # the slowness comes from the *blocking* clprg call, is this pyopencl?
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx0 = (a0 + b1).realize()
    CL.synchronize()
  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx1 = (b1 + a0).realize()
    CL.synchronize()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc1 = ab1.numpy()

  # same
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # devices
  print(ab0)
  print(ab1)
  print(abx0)
  print(abx1)
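
# Units note (assuming Timing hands on_exit the elapsed nanoseconds, as
# tinygrad.helpers.Timing does): sz*4 is the buffer size in bytes, and
# bytes/ns == GB/s, so for sz = 1024*1024*256 floats each copy moves 1 GiB.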

View File

@@ -0,0 +1,41 @@
from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
from tinygrad.helpers import dtypes
import time

N = 1000000
a = CLBuffer(N, dtypes.float32)
b = CLBuffer(N, dtypes.float32)
c = CLBuffer(N, dtypes.float32)

prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
  int idx = get_global_id(0);
  a[idx] = b[idx] + c[idx];
}""")

# warmup run so the kernel is compiled and resident
prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)

# one timed run, a 3 second sleep, then a second timed run
t1 = time.monotonic_ns()
e1 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t2 = time.monotonic_ns()
time.sleep(3)
t3 = time.monotonic_ns()
e2 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t4 = time.monotonic_ns()

# host wall time vs the device-side event profile for each run
print(e1.profile.queued)
print(e1.profile.submit)
print(e1.profile.start)
print(e1.profile.end)
print(e1, e2)
print(t2-t1, e1.profile.end - e1.profile.start)
print(t4-t3, e2.profile.end - e2.profile.start)
print(t3-t2, e2.profile.queued-e1.profile.end)
print((t3-t2) / (e2.profile.start-e1.profile.end), "ratio")
print("ratio since boot", t1/e1.profile.start)
print(e1.profile.start)
print(e1.profile.end)
print(e2.profile.start)
print(e2.profile.end)
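
# Interpretation note (OpenCL semantics, not specific to this file): the event
# .profile values are device-clock timestamps in nanoseconds, while
# time.monotonic_ns() is the host clock, so the host/device deltas and the
# "ratio since boot" line are only meaningful to the extent that the two
# clocks share an epoch and rate.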

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env python
import unittest, gc
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_state_dict
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad.ops import Device
from test.helpers import derandomize_model
from examples.llama import Transformer
ALLOCATED_DEV_BUFS = 0
class FakeDeviceBuffer:
  def __init__(self, sz, dt, device):
    self.id = 1
    self.size = sz
    self.dtype = dt
    self.device = device

    global ALLOCATED_DEV_BUFS
    ALLOCATED_DEV_BUFS += 1

class FakeAllocator(LRUAllocator):
  def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
  def _do_free(self, buf):
    buf.id -= 1
    assert buf.id == 0, f"Free should be called once, but {buf.id}"
  def __del__(self): # Fake allocator should clear all buffers after each test.
    for v in self.cached_buffers.values():
      for buf, _ in v: self._free_buffer(buf)

FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
  def __init__(self, size, dtype, device='0'):
    global FAKE_GLOBAL_ALLOCATOR
    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
    assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)

class FakeProgram:
  def __init__(self, name:str, prg:str): pass
  def __call__(self, *bufs, global_size, local_size, wait=False): pass

def helper_test_correctness(gen, train):
  from tinygrad.runtime.ops_gpu import CL, CLAllocator
  old_alloc = CL.cl_allocator
  CL.cl_allocator = CLAllocator(0)
  no_alloc_result = train(*gen()).numpy()
  Device[Device.DEFAULT].synchronize()
  CL.cl_allocator = CLAllocator(512<<30) # Test cache correctness, so cache as much as possible, 512gb
  for _ in range(4):
    GlobalCounters.reset()
    np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
    Device[Device.DEFAULT].synchronize()
  assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
  CL.cl_allocator = old_alloc

def __helper_test_alloc_count(gen, train):
  was_alloc = ALLOCATED_DEV_BUFS
  for _ in range(2):
    train(*gen())
  return ALLOCATED_DEV_BUFS - was_alloc

def helper_test_alloc_count(mm, gen, train):
  global FAKE_GLOBAL_ALLOCATOR
  backup_program = Device[Device.DEFAULT].runtime
  backup_buffer = Device[Device.DEFAULT].buffer
  Device[Device.DEFAULT].runtime = FakeProgram
  Device[Device.DEFAULT].buffer = FakeBuffer
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
  new_allocs = __helper_test_alloc_count(gen, train)
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
  old_allocs = __helper_test_alloc_count(gen, train)
  print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
  assert new_allocs < old_allocs, "Hmm, doesn't cache work any more?"
  Device[Device.DEFAULT].runtime = backup_program
  Device[Device.DEFAULT].buffer = backup_buffer
  FAKE_GLOBAL_ALLOCATOR = None

def check_gc():
  if Device.DEFAULT == "GPU":
    gc.collect() # Need to collect Tensors.
    from extra.introspection import print_objects
    assert print_objects() == 0

class TestAllocators(unittest.TestCase):
  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def __test():
      model = Transformer(**args_tiny)
      derandomize_model(model)
      def test(t): return model(t, 0).realize()
      helper_test_correctness(lambda: (Tensor([[1,]]),), test)
    __test()

    Tensor.default_type = old_type
    check_gc()

  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama_alloc_counts(self):
    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def test_alloc_count(t):
      model = Transformer(**args_tiny)
      for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
      return model(t, 0).realize()
    helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
    check_gc()

  @unittest.skip("huge for CI")
  def test_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()
    helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test)

if __name__ == "__main__":
  unittest.main()

View File

@@ -0,0 +1,8 @@
from tinygrad.tensor import Tensor
from tinygrad.nn import Embedding

if __name__ == "__main__":
  vocab_size = 50257
  dim = 128
  test = Embedding(vocab_size, dim)
  ret = test(Tensor([[1,2,3]])).numpy()
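  # hedged sanity check (assumes Embedding maps (batch, seq) -> (batch, seq, dim))
  assert ret.shape == (1, 3, dim), f"unexpected output shape {ret.shape}"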

View File

@@ -0,0 +1,208 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.ops import LazyOp, ReduceOps, BinaryOps, UnaryOps, MovementOps
from tinygrad.shape.shapetracker import ShapeTracker, View, ZeroView
from tinygrad.runtime.ops_gpu import GPUBuffer, CLProgram, CLCodegen
#from tinygrad.runtime.ops_metal import MetalBuffer as GPUBuffer, MetalProgram as CLProgram, MetalCodegen as CLCodegen
from tinygrad.helpers import getenv
from extra.lib_test_ast import test_ast
import platform
OSX = platform.system() == "Darwin"
def compile_and_test_ast(ast, local_size=None):
  k = CLCodegen(ast)
  prg = k.codegen().build(CLProgram)
  if local_size is not None: prg.local_size = local_size
  for i in range(5): prg(prg.lower(k.bufs))
  if getenv("TEST", 0): test_ast(k)

class TestAST(unittest.TestCase):
  def test_conv_zeroview_ast(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (2, 2, 2, 1), -3), ZeroView((1, 1, 1, 2), ((0, 1), (0, 1), (-1, 2), (-1, 3))), View((1, 1, 3, 4), (0, 0, 4, 1), 0)]))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (0, 0, 0, 0), 0)]))
    op1 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(UnaryOps.RELU, (op1,), None)
    compile_and_test_ast(ast)

  def test_cifar_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 64, 34, 34), (65536, 1024, 32, 1), -33), ZeroView((512, 64, 32, 32), ((0, 512), (0, 64), (-1, 33), (-1, 33))), View((512, 1, 128, 32, 32, 64, 3, 3), (73984, 73984, 0, 34, 1, 1156, 34, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 64, 32, 32), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 1, 128, 32, 32, 64, 3, 3), (0, 0, 576, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 64, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(ReduceOps.SUM, (op0,), (512, 1, 128, 32, 32, 1, 1, 1))
    compile_and_test_ast(ast)

  def test_cifar_conv_backward(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 512, 10, 10), (64, 16384, 8, 1), -9), ZeroView((256, 512, 8, 8), ((0, 256), (0, 512), (-1, 9), (-1, 9))), View((256, 1, 512, 3, 3, 512, 8, 8), (51200, 51200, 0, 10, 1, 100, 10, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 256, 8, 8), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 1, 512, 3, 3, 512, 8, 8), (0, 0, 64, 0, 0, 32768, 8, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 512, 8, 8), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (256, 1, 512, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (256, 512, 3, 3))
    compile_and_test_ast(ast)

  def test_first_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 130, 258, 1, 12), (393216, 3072, 12, 12, 1), -3084), ZeroView((1, 128, 256, 1, 12), ((0, 1), (-1, 129), (-1, 257), (0, 1), (0, 12))), View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 6192, 24, 0, 0, 3096, 12, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 768, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 0, 0, 432, 4, 144, 16, 48, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 108, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_second_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 66, 130, 32, 1), (262144, 4096, 32, 1, 1), -4128), ZeroView((1, 64, 128, 32, 1), ((0, 1), (-1, 65), (-1, 129), (0, 32), (0, 1))), View((1, 64, 128, 8, 4, 1, 1, 3, 3), (266240, 4160, 32, 4, 1, 12480, 12480, 4160, 32), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 64, 128, 8, 4, 1, 1, 3, 3), (0, 0, 0, 36, 1, 0, 0, 12, 4), 0)]), hostbuf=GPUBuffer(shape=(8, 9, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_third_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 4096, 32, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 0, 0, 128, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(4, 32, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 4, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 4, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(16,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op2,), (64, 512, 4))
    compile_and_test_ast(ast)

  # VALIDHACKS=1 IMAGE=2 DEBUG=4 PYTHONPATH="." GPU=1 OPT=2 python3 test/external_test_gpu_ast.py TestAST.test_reduce_op
  # 164 time 27.75 ms running re_S128_4 with [128] None count 4 runtime 1016.06 us 2.07 GFLOPS () -> (128, 1)
  # 169 time 22.51 ms running matmul with [4, 16, 128] [4, 16, 16] count 5 runtime 110.08 us 19.06 GFLOPS ('-DMATMUL',) -> (128, 1)
  def test_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 128, 4, 1, 1, 1, 1), (512, 512, 512, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  def test_alt_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  # re_S32_16_36_6 is fast
  def test_1x1_36_6(self): # 36 <- 6
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 1536, 24, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 0, 0, 0, 0, 96, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(36, 24, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(144,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op6 = LazyOp(BinaryOps.MAX, (op5,buf5,), None)
    op7 = LazyOp(BinaryOps.SUB, (op3,op6,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op7,), (32, 2304, 4))
    compile_and_test_ast(ast, None if OSX else (16, 16, 4))

  # re_S32_16_6_36 is slow
  def test_1x1_6_36(self): # 6 <- 36
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 9216, 144, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 2304, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 0, 0, 0, 0, 576, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 144, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 1536, 24, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    op3 = LazyOp(BinaryOps.ADD, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 16, 4))

  # re_S32_16_6_24
  def test_1x1_6_24(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 6144, 96, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 1536, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 0, 0, 0, 0, 384, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 96, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    #buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    #op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 4, 8))

  def test_full_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (512, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op0 = LazyOp(BinaryOps.ADD, (buf0,buf1,), None)
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([2.], dtype=np.float32)))
    op1 = LazyOp(BinaryOps.POW, (op0,buf2,), None)
    op2 = LazyOp(ReduceOps.SUM, (op1,), (1, 1))
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([0.5], dtype=np.float32)))
    op3 = LazyOp(BinaryOps.POW, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op4 = LazyOp(BinaryOps.SUB, (op3,buf4,), None)
    op5 = LazyOp(UnaryOps.RELU, (op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op6 = LazyOp(BinaryOps.ADD, (op5,buf5,), None)
    buf6 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([3.4e+38], dtype=np.float32)))
    op7 = LazyOp(BinaryOps.SUB, (op3,buf6,), None)
    op8 = LazyOp(UnaryOps.RELU, (op7,), None)
    op9 = LazyOp(BinaryOps.SUB, (op6,op8,), None)
    op10 = LazyOp(UnaryOps.RECIPROCAL, (op9,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op10,), (1, 1))
    compile_and_test_ast(ast)

  def test_1239_reduce(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 64, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 1024, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(1239, 256, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1239, 4, 1, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (1, 1, 1, 1, 4956))
    compile_and_test_ast(ast)

  def test_enet_first_conv_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 3, 225, 225), (150528, 50176, 224, 1), 0), ZeroView((8, 3, 224, 224), ((0, 8), (0, 3), (0, 225), (0, 225))), View((8, 1, 32, 112, 112, 3, 3, 3), (151875, 151875, 0, 450, 2, 50625, 225, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 1, 32, 112, 112, 3, 3, 3), (0, 0, 27, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (8, 1, 32, 112, 112, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (8, 32, 112, 112))
    compile_and_test_ast(ast)

  def test_enet_reduce_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 32, 225, 225), (50176, 150528, 224, 1), 0), ZeroView((3, 32, 224, 224), ((0, 3), (0, 32), (0, 225), (0, 225))), View((3, 1, 32, 3, 3, 32, 112, 112), (1620000, 1620000, 0, 225, 1, 50625, 450, 2), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 1, 32, 3, 3, 32, 112, 112), (0, 12845056, 401408, 0, 0, 12544, 112, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 32, 1, 1, 32, 112, 112), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (3, 1, 32, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (3, 32, 3, 3))
    compile_and_test_ast(ast)

if __name__ == '__main__':
  unittest.main()

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python
import os
import unittest
import numpy as np
if 'IMAGE' not in os.environ:
  os.environ['IMAGE'] = '2'
  os.environ['GPU'] = '1'
  os.environ['OPT'] = '2'
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
Tensor.no_grad = True

class TestImage(unittest.TestCase):
  def test_create_image(self):
    t = Tensor.ones(128, 128, 1)
    t = t.reshape(128, 32, 4) + 3
    t.realize()
    np.testing.assert_array_equal(t.numpy(), np.ones((128,32,4))*4)

  def test_sum_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t1.realize()
    t1 = t1.sum()
    t1.realize()
    assert t1.numpy() == 16*4*4*4, f"got {t1.numpy()}"

  def test_add_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t2 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 4
    t1.realize()
    t2.realize()
    t3 = t1 + t2
    t3.realize()
    np.testing.assert_array_equal(t3.numpy(), np.ones((16,4,4))*9)

  def test_padded_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    tiny_conv(tiny_dat).realize()

  def test_op_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dconv = Conv2d(out_chans, out_chans, 1, bias=None, padding=0)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    p2 = tiny_conv(tiny_dat).relu()
    p2 = tiny_dconv(p2)
    p2.realize()

if __name__ == '__main__':
  unittest.main()
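
# Background (my reading of the IMAGE=2 backend, not stated in this file):
# image mode backs tensors with OpenCL image2d_t objects holding float4
# texels, which is why every tensor here reshapes to a trailing dimension of 4.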

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
from tinygrad.helpers import dtypes, CI
from tinygrad.ops import Device
from test.helpers import derandomize_model
from examples.llama import Transformer
def helper_test_jitted_correctness(gen, train, train_jit):
  nojit = train(*gen()).numpy()
  for _ in range(5): jit = train_jit(*gen()).numpy()
  np.testing.assert_allclose(nojit, jit, rtol=1e-3, atol=1e-5)

@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE, "needs JIT")
class TestJittedModels(unittest.TestCase):
  def test_jitted_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    derandomize_model(model)
    def test(t): return model(t, 0).realize()

    @TinyJit
    def test_jit(t): return model(t, 0).realize()

    helper_test_jitted_correctness(lambda: (Tensor([[1,]]),), test, test_jit)
    Tensor.default_type = old_type

  @unittest.skipUnless(not CI, "huge for CI")
  def test_jitted_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()

    @TinyJit
    def test_jit(t, t2): return model(t, 801, t2).realize()

    helper_test_jitted_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test, test_jit)

if __name__ == "__main__":
  unittest.main()

View File

@@ -0,0 +1,208 @@
import unittest
from onnx.backend.base import Backend, BackendRep
import onnx.backend.test
import numpy as np
from tinygrad.tensor import Tensor
from typing import Any, Tuple
from tinygrad.helpers import getenv, CI
# pip3 install tabulate
pytest_plugins = 'onnx.backend.test.report',
from extra.onnx import get_run_onnx
class TinygradModel(BackendRep):
  def __init__(self, run_onnx, input_names):
    super().__init__()
    self.fxn = run_onnx
    self.input_names = input_names

  def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
    real_inputs = {k:v for k,v in zip(self.input_names, inputs)}
    ret = self.fxn(real_inputs, debug=True)
    return tuple(x.numpy() if isinstance(x, Tensor) else [i.numpy() for i in x] if isinstance(x, list) else np.array(x) for x in ret.values())

class TinygradBackend(Backend):
  @classmethod
  def prepare(cls, model, device):
    input_all = [x.name for x in model.graph.input]
    input_initializer = [x.name for x in model.graph.initializer]
    net_feed_input = [x for x in input_all if x not in input_initializer]
    print("prepare", cls, device, net_feed_input)
    run_onnx = get_run_onnx(model)
    return TinygradModel(run_onnx, net_feed_input)

  @classmethod
  def supports_device(cls, device: str) -> bool:
    return device == "CPU"
backend_test = onnx.backend.test.BackendTest(TinygradBackend, __name__)
# no support for reduce with multiply (needs llop)
backend_test.exclude('test_reduce_prod_*')
# TODO figure out why it's returning wrong values, geohotstan's uneducated guess is it's due to imprecision from float64 (double) -> float32
# see Type Constraints: https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Adam.html#type-constraints
backend_test.exclude('test_adam_multiple_cpu')
backend_test.exclude('test_nesterov_momentum_cpu')
# we only support float32
backend_test.exclude('uint8')
backend_test.exclude('uint16')
backend_test.exclude('uint32')
backend_test.exclude('uint64')
backend_test.exclude('int8')
backend_test.exclude('int16')
backend_test.exclude('float64')
backend_test.exclude('string')
backend_test.exclude('test_pow_types_int*')
backend_test.exclude('test_cast_*')
backend_test.exclude('test_castlike_*')
backend_test.exclude('test_convinteger_*')
backend_test.exclude('test_matmulinteger_*')
backend_test.exclude('test_reduce_log_sum_exp*') # dependent on actual float64 implementation for backends
backend_test.exclude('test_operator_add*') # dependent on float64 math. Without it values default to 0 or inf
# we don't support indexes
# backend_test.exclude('test_argmax_*') # Needs more work: select_last_index
# backend_test.exclude('test_argmin_*') # Needs more work: select_last_index
backend_test.exclude('test_nonzero_*')
# no support for mod
backend_test.exclude('test_mod_*')
# no boolean ops (2d, 3d, 4d)
backend_test.exclude('test_bitshift_*')
# no scatternd gathernd
backend_test.exclude('test_gathernd_*')
backend_test.exclude('test_scatternd_*')
# no quantize
backend_test.exclude('test_dynamicquantizelinear_*')
backend_test.exclude('test_qlinearmatmul_*')
backend_test.exclude('test_qlinearconv_*')
backend_test.exclude('test_quantizelinear_*')
# no rnn
backend_test.exclude('test_gru_*')
backend_test.exclude('test_rnn_*')
backend_test.exclude('test_lstm_*')
backend_test.exclude('test_simple_rnn_*')
# no control flow
backend_test.exclude('test_if_*')
backend_test.exclude('test_loop*')
backend_test.exclude('test_range_float_type_positive_delta_expanded_cpu') # requires loop
# unsupported (strange) ops
backend_test.exclude('test_bitwise_*')
backend_test.exclude('test_blackmanwindow_*')
backend_test.exclude('test_bernoulli_*')
backend_test.exclude('test_cumsum_*')
backend_test.exclude('test_det_*')
backend_test.exclude('test_tril_zero_cpu') # TODO: zero array support
backend_test.exclude('test_triu_zero_cpu') # TODO: zero array support
backend_test.exclude('test_col2im_*')
backend_test.exclude('test_hammingwindow_*')
backend_test.exclude('test_hannwindow_*')
backend_test.exclude('test_hardmax_*')
backend_test.exclude('test_gridsample_*')
backend_test.exclude('test_dft_*')
backend_test.exclude('test_einsum_*')
backend_test.exclude('test_strnorm_*')
backend_test.exclude('test_unique_*')
backend_test.exclude('test_sequence_*')
backend_test.exclude('test_nonmaxsuppression_*')
backend_test.exclude('test_reversesequence_*')
backend_test.exclude('test_roialign_*')
backend_test.exclude('test_top_k_*')
backend_test.exclude('test_tfidfvectorizer_*')
backend_test.exclude('test_stft_*')
backend_test.exclude('test_melweightmatrix_*')
# more strange ops
backend_test.exclude('test_basic_deform_conv_*')
backend_test.exclude('test_deform_conv_*')
backend_test.exclude('test_lppool_*')
backend_test.exclude('test_depthtospace_*')
backend_test.exclude('test_spacetodepth_*')
backend_test.exclude('test_scan*')
backend_test.exclude('test_split_to_sequence_*')
backend_test.exclude('test_resize_downsample_scales_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_downsample_sizes_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_scales_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_sizes_cubic_*') # unsure how to implement cubic
# rest of the failing tests
backend_test.exclude('test_averagepool_2d_dilations_cpu') # dilations != 1 not supported for avgpool
backend_test.exclude('test_convtranspose_autopad_same_cpu') # TODO geohotstan has no idea how this is done, autopad requires output_shape but output_shape requires pads from autopad
backend_test.exclude('test_optional_has_element_empty_optional_input_cpu') # Attempts to create Tensor from None
backend_test.exclude('test_range_int32_type_negative_delta_expanded_cpu') # AttributeProto.GRAPH not implemented
backend_test.exclude('test_reshape_allowzero_reordered_cpu') # reshaping to 0 shape
backend_test.exclude('test_resize_downsample_scales_linear_antialias_cpu') # antialias not implemented
backend_test.exclude('test_resize_downsample_sizes_linear_antialias_cpu') # antialias not implemented
backend_test.exclude('test_resize_tf_crop_and_resize_cpu') # unsure about fill value after clip
backend_test.exclude('test_operator_addconstant_cpu') # bad data type
# issue 1556 https://github.com/tinygrad/tinygrad/issues/1556
backend_test.exclude('test_isinf_cpu')
backend_test.exclude('test_isinf_negative_cpu')
backend_test.exclude('test_isinf_positive_cpu')
backend_test.exclude('test_isnan_cpu')
# issue 1791 fast math messes with these https://github.com/tinygrad/tinygrad/issues/1791
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_2_3_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_3_2_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_cpu')
# issue 2067 potentially also a fastmath issue https://github.com/tinygrad/tinygrad/issues/2067
if getenv('METAL'):
  backend_test.exclude('test_maxpool_2d_pads_cpu')
  backend_test.exclude('test_maxpool_2d_same_lower_cpu')

# Don't know how to treat special TensorProto like TensorProto.FLOAT8E4M3FN
if getenv("CPU") or getenv("TORCH"):
  backend_test.exclude('test_dequantizelinear_axis_cpu')
  backend_test.exclude('test_dequantizelinear_cpu')

# compiled backends cannot reshape to and from 0
if getenv('LLVM') or getenv('GPU') or getenv('CLANG') or getenv('METAL') or getenv('CUDA'):
  backend_test.exclude('test_slice_start_out_of_bounds_cpu')
  backend_test.exclude('test_constantofshape_int_shape_zero_cpu')

if getenv('GPU') or getenv('METAL'):
  backend_test.exclude('test_mish_cpu') # weird inaccuracy
  backend_test.exclude('test_mish_expanded_cpu') # weird inaccuracy
  backend_test.exclude('test_eyelike_with_dtype_cpu') # backend does not support dtype: Double

# Segfaults in CI
if (getenv('LLVM') or getenv('CUDA')) and CI:
  backend_test.exclude('test_max_float16_cpu')
  backend_test.exclude('test_min_float16_cpu')

# disable model tests for now since they are slow
if not getenv("MODELTESTS"):
  for x in backend_test.test_suite:
    if 'OnnxBackendRealModelTest' in str(type(x)):
      backend_test.exclude(str(x).split(" ")[0])
else:
  # model tests all pass!
  backend_test.include('test_resnet50')
  backend_test.include('test_inception_v1')
  backend_test.include('test_inception_v2')
  backend_test.include('test_densenet121')
  backend_test.include('test_shufflenet')
  backend_test.include('test_squeezenet')
  backend_test.include('test_bvlc_alexnet')
  backend_test.include('test_zfnet512')
  backend_test.include('test_vgg19')
globals().update(backend_test.enable_report().test_cases)
if __name__ == '__main__':
unittest.main()
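
# To run a subset of the generated cases, pytest's -k filter works on the test
# names enabled above, e.g. (illustrative): python3 -m pytest <this file> -k relu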

View File

@@ -0,0 +1,392 @@
#!/usr/bin/env python
import os
import torch
if "OPT" not in os.environ:
os.environ["OPT"] = "2"
import gc
import numpy as np
import unittest
from tinygrad.tensor import Tensor, Device
from tinygrad import nn
from tinygrad.helpers import getenv
from tinygrad.nn import optim
from tinygrad.helpers import GlobalCounters
from tinygrad.lazy import PUSH_PERMUTES
from tinygrad.jit import CacheCollector
class CLCache:
def __init__(self, allowed=None, strict=False, preclear=True): self.allowed, self.strict, self.preclear = allowed, strict, preclear
def __enter__(self):
if self.preclear:
gc.collect()
for x in [x for x in gc.get_objects() if isinstance(x, Tensor)]:
x.realize()
GlobalCounters.reset()
CacheCollector.start()
print("cache: entering")
def __exit__(self, type, value, traceback):
cache = CacheCollector.finish()
print(f"cache: exiting with size {len(cache)}", f"allowed {self.allowed}" if self.allowed is not None else "")
if self.allowed is not None:
assert len(cache) <= self.allowed and (not self.strict or len(cache) == self.allowed), f"used too many kernels! {len(cache)} > {self.allowed}"
from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.resnet import ResNet18
from models.vit import ViT
from tinygrad.nn.state import get_parameters
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestInferenceMinKernels(unittest.TestCase):
def setUp(self):
Tensor.training = False
@unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
def test_convnext(self):
model = ConvNeXt()
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
with CLCache(129):
model(img).realize()
def test_enet(self):
model = EfficientNet(getenv("ENET_NUM", 0), has_se=False)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
with CLCache(51):
model.forward(img).realize()
def test_enet_se(self):
model = EfficientNet(getenv("ENET_NUM", 0), has_se=True)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
# TODO: this seems very high
with CLCache(115):
model.forward(img).realize()
def test_resnet(self):
model = ResNet18()
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
with CLCache(26):
model.forward(img).realize()
def test_vit(self):
model = ViT(embed_dim=192, num_heads=3)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
img = Tensor.randn(1, 3, 224, 224)
with CLCache(222): # NOTE: this is way too high
out = model.forward(img)
assert len(CacheCollector.cache) == 0, "ViT prerealized?"
out.realize()
def test_llama(self):
from examples.llama import Transformer
args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
model = Transformer(**args_tiny)
for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
with CLCache(85):
model(Tensor([[1,2,3,4]]), 0).realize()
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptBinOp(unittest.TestCase):
def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
a = Tensor.randn(16, 16)
b = Tensor.randn(16, 16)
with CLCache():
c = f1(a, b)
if f2 is not None: d = f2(a, b)
c.realize()
if f2 is not None: d.realize()
assert len(CacheCollector.cache) == allowed, "binop was rerun!"
if f2 is not None: np.testing.assert_allclose(c.numpy().ravel(), d.numpy().ravel(), rtol=1e-3, atol=1e-5)
def test_no_binop_rerun(self): return self._test_no_binop_rerun(lambda a,b: a*b, lambda a,b: (a*b).reshape(16, 16, 1))
def test_no_binop_rerun_alt(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(16, 16, 1), lambda a,b: a*b)
def test_no_binop_rerun_reduce_broadcast(self): return self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=2)
@unittest.skip("this test started failing with the new change, based movementop issue")
def test_no_binop_rerun_transposed(self): return self._test_no_binop_rerun(lambda a,b: (a.T*b.T).T, lambda a,b: a*b)
def test_no_binop_rerun_mid_reshape(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(256)+a.reshape(256))
# currently non working tests
#def test_no_binop_rerun_preshape(self): return self._test_no_binop_rerun(lambda a,b: a.reshape(16, 16, 1)*b.reshape(16, 16, 1), lambda a,b: a*b)
#def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
#def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptReduceLoop(unittest.TestCase):
@unittest.skip("this is broken")
def test_loop_left(self):
    a = Tensor.randn(16, 16)
with CLCache():
t = a.sum(0)
b = t.reshape(16,1).expand(16,16).sum(0)
c = (t+b)
c.realize()
assert len(CacheCollector.cache) == 2, "loop left fusion broken"
def test_loop_right(self):
    a = Tensor.randn(16, 16)
with CLCache():
t = a.sum(0)
b = t.reshape(16,1).expand(16,16).sum(0)
c = (b+t)
c.realize()
assert len(CacheCollector.cache) == 2, "loop right fusion broken"
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptWChild(unittest.TestCase):
def test_unrealized_child(self):
a = Tensor.randn(16, 16)
b = Tensor.randn(16, 16)
with CLCache():
c = (a*b).sum()
d = c+1
e = c+2
d.realize()
assert len(CacheCollector.cache) == 2, "don't fuse if you have children"
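      # why 2 kernels: c has a second unrealized child (e), so the scheduler can't
      # fuse c's reduce into d's elementwise kernel without recomputing the sum
      # for e later; hence one kernel for c plus one for d.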
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOpt(unittest.TestCase):
def test_muladd(self):
a,b,c = [Tensor.ones(2,2) for _ in range(3)]
with CLCache():
d = a * b + c
d.realize()
assert len(CacheCollector.cache) == 1, "optimizer didn't fold muladd"
np.testing.assert_allclose(d.numpy(), np.ones((2,2))*2, rtol=1e-5)
def test_fold_reduce_elementwise(self):
img = Tensor.ones(32)
addme = Tensor.ones(1)
with CLCache():
ret = img.sum() + addme
ret.realize()
assert len(CacheCollector.cache) == 1, "optimizer didn't fold reduce/elementwise"
assert ret.numpy()[0] == 33
def test_fold_batchnorm(self):
with Tensor.train():
img = Tensor.ones(1,32,4,4)
bn = nn.BatchNorm2d(32, track_running_stats=False)
with CLCache():
img_bn = bn(img).realize()
print(img_bn)
assert len(CacheCollector.cache) == 3, f"optimizer didn't fold batchnorm, got {len(CacheCollector.cache)}"
def test_fold_conv_sgd(self):
with Tensor.train():
img = Tensor.ones(2,3,4,4)
c1 = nn.Conv2d(3,32,3)
opt = optim.SGD(get_parameters(c1))
with CLCache():
opt.zero_grad()
c1(img).relu().sum().backward()
opt.step()
# TODO: this should be 4, but the sum output child stays around
# with pushing_permutes it can be 3
# TODO: broken with optim fixes
assert len(CacheCollector.cache) in [4,5,6], f"optimizer didn't fold conv-backward SGD, got {len(CacheCollector.cache)}"
def test_fold_2convs_sgd(self):
with Tensor.train():
img = Tensor.ones(2,3,64,64)
c1 = nn.Conv2d(3,16,3,bias=False)
c2 = nn.Conv2d(16,32,3,bias=False)
opt = optim.SGD(get_parameters([c1, c2]))
with CLCache(allowed=9):
opt.zero_grad()
c2(c1(img).relu()).relu().sum().backward()
opt.step()
def test_fold_4convs_sgd(self):
with Tensor.train():
img = Tensor.ones(2,3,64,64)
c1 = nn.Conv2d(3,4,3,bias=False)
c2 = nn.Conv2d(4,8,3,bias=False)
c3 = nn.Conv2d(8,16,3,bias=False)
c4 = nn.Conv2d(16,32,3,bias=False)
opt = optim.SGD(get_parameters([c1, c2, c3, c4]))
with CLCache(allowed=19):
opt.zero_grad()
c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
opt.step()
def test_fold_conv_batchnorm_sgd(self):
with Tensor.train():
img = Tensor.ones(1,3,4,4)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
opt = optim.SGD(get_parameters([c1, bn]))
with CLCache(allowed=18): # this is too high
img_bn = bn(c1(img)).elu().sum()
opt.zero_grad()
img_bn.backward()
opt.step()
def test_fold_conv_batchnorm_notrain(self):
img = Tensor.ones(1,3,8,8)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
# precache the bn
img_conv = bn(c1(img)).relu().realize()
with CLCache():
img_conv = bn(c1(img)).relu().realize()
assert len(CacheCollector.cache) == 1, f"optimizer didn't fold conv-batchnorm at test time, got {len(CacheCollector.cache)}"
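    # The uncounted warm-up pass above realizes one-time work (weight init, etc.),
    # so the counted second pass is presumably expected to collapse the whole
    # conv+batchnorm+relu chain into a single fused kernel at inference time.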
def test_fold_conv_batchnorm(self):
with Tensor.train():
img = Tensor.ones(1,3,8,8)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
with CLCache():
img_conv = bn(c1(img)).relu().realize()
print(img_conv)
assert len(CacheCollector.cache) == 4, f"optimizer didn't fold conv-batchnorm, got {len(CacheCollector.cache)}"
def test_fold_conv_elu(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3)
c2 = nn.Conv2d(4, 4, kernel_size=3)
with CLCache():
img_conv = img.sequential([c1, Tensor.elu, c2, Tensor.elu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/elu"
def test_fold_conv_relu(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3)
c2 = nn.Conv2d(4, 4, kernel_size=3)
with CLCache():
img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"
def test_fold_conv_relu_nobias(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
c2 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
with CLCache():
img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"
def test_permute_was_pushed(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = c.permute(1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
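    # With PUSH_PERMUTES the permute is pushed above the reduce, so the sum and
    # the transposed contiguous output render as one kernel; without the flag a
    # separate copy kernel is acceptable, which is why cache_len is only checked
    # when PUSH_PERMUTES is set.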
def test_permute_was_pushed_through_contract_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache():
c = a.sum(-1)
d = c.reshape(16,16).permute(1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,16).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
def test_permute_was_pushed_through_contractw1s_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache():
c = a.sum(-1)
d = c.reshape(16,1,16).permute(2,1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,1,16).transpose(2,1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
# TODO: push permute through expansion reshape
@unittest.skip("expansion can't push expand permute yet")
@unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
def test_permute_was_pushed_through_expand_reshape(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0).reshape(4,4,4,4), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
@unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
def test_no_reduceop_rerun(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = a.sum(2).permute(1,0)
c.realize()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(c.numpy().transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
assert cache_len == 1, "reduceop was rerun!"
@unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
def test_no_reduceop_rerun_alt(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2).permute(1,0)
d = a.sum(2)
c.realize()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(c.numpy(), d.numpy().transpose(1,0), rtol=1e-3, atol=1e-5)
assert cache_len == 1, "reduceop was rerun!"
def test_fold_with_contiguous(self):
a = Tensor.randn(16, 16, 16)
b = Tensor.randn(16, 16)
with CLCache():
c = (a.sum(2).contiguous() + b).contiguous()
c.realize()
cache_len = len(CacheCollector.cache)
assert cache_len == 1, "contiguous wasn't folded"
def _test_fold_expand_reduce_helper(self, n, m, axis, allowed):
b = torch.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
with CLCache(allowed=allowed):
a = Tensor.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
a.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
return cache_len
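  # Expected budgets for the two tests below: expanding a reduction back out and
  # reducing along the same axis should fold into 2 kernels, while reducing along
  # a different axis needs the realized intermediate, hence 3. (These counts
  # mirror the allowed= values; they are not a documented guarantee.)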
  def test_expand_reduce_is_folded_on_same_axis(self):
    for axis in [0, 1]:
      for n in [4, 8, 16]:
        self._test_fold_expand_reduce_helper(n, n, axis, allowed=2)
  def test_expand_reduce_is_not_folded_on_different_axes(self):
    axis1, axis2 = 0, 1
    for n in [4, 8, 16]:
      b = torch.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
      with CLCache(allowed=3):
        a = Tensor.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
        a.realize()
      np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python
import unittest
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import LAMB
np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
W_init = np.random.randn(4,4).astype(np.float32)
m_init = np.random.randn(1,4).astype(np.float32)
class TinyNet:
def __init__(self):
self.x = Tensor(x_init.copy(), requires_grad=True)
self.W = Tensor(W_init.copy(), requires_grad=True)
self.m = Tensor(m_init.copy())
def forward(self):
out = self.x.matmul(self.W).relu()
out = out.log_softmax(1)
out = out.mul(self.m).add(self.m).sum()
return out
class TinyNetTF:
def __init__(self):
self.x = tf.Variable(x_init.copy(), trainable=True)
self.W = tf.Variable(W_init.copy(), trainable=True)
self.m = tf.constant(m_init.copy())
def forward(self):
out = tf.matmul(self.x, self.W)
out = tf.nn.relu(out)
out = tf.nn.log_softmax(out, axis=1)
out = tf.multiply(out, self.m) + self.m
out = tf.reduce_sum(out)
return out
def step(optim, steps=1, kwargs={}):
net = TinyNet()
optim = optim([net.x, net.W], **kwargs)
for _ in range(steps):
out = net.forward()
optim.zero_grad()
out.backward()
optim.step()
return net.x.detach().numpy(), net.W.detach().numpy()
def step_tf(optim, steps=1, kwargs={}):
net = TinyNetTF()
optim = optim(**kwargs)
for _ in range(steps):
with tf.GradientTape() as tape:
out = net.forward()
grads = tape.gradient(out, [net.x, net.W])
optim.apply_gradients(zip(grads, [net.x, net.W]))
return net.x.numpy(), net.W.numpy()
class ExternalTestOptim(unittest.TestCase):
def _test_optim(self, tinygrad_optim, tensorflow_optim, steps, opts, atol, rtol):
for x,y in zip(step(tinygrad_optim, steps, kwargs=opts),
step_tf(tensorflow_optim, steps, kwargs=opts)):
np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)
def _test_lamb(self, steps, opts, atol, rtol): self._test_optim(LAMB, tfa.optimizers.LAMB, steps, opts, atol, rtol)
def test_lamb(self): self._test_lamb(1, {'lr': 0.001}, 1e-5, 0)
def test_lamb_high_lr(self): self._test_lamb(1, {'lr': 10}, 1e-5, 1e-5)
def test_multistep_lamb(self): self._test_lamb(10, {'lr': 0.001}, 1e-5, 0)
def test_multistep_lamb_high_lr(self): self._test_lamb(10, {'lr': 10}, 1e-5, 3e-4)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,57 @@
# NOTE: this only tests the speed of the LLaMA codegen; it doesn't actually run the net
import unittest, time
import numpy as np
from examples.llama import Transformer, MODEL_PARAMS
from test.test_net_speed import start_profile, stop_profile
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.ops import Compiled
from tinygrad.helpers import dtypes, prod
from tinygrad.runtime.lib import RawBuffer
class FakeProgram:
def __init__(self, name:str, prg:str): pass
def __call__(self, *bufs, global_size, local_size, wait=False): pass
class RawFakeBuffer(RawBuffer):
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
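# Swapping the compiled backend's runtime and buffer for these no-ops keeps the
# full trace -> schedule -> linearize -> render pipeline running while skipping
# real compilation and execution, so the timings below should measure only the
# Python-side codegen cost.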
class TestLLaMASpeed(unittest.TestCase):
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
def test_llama_compile(self):
backup_program = Device[Device.DEFAULT].runtime
backup_buffer = Device[Device.DEFAULT].buffer
Device[Device.DEFAULT].runtime = FakeProgram
Device[Device.DEFAULT].buffer = RawFakeBuffer
print("testing llama python run time")
model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
print("built model")
# assign fake tensors to the values
for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
print("assigned empty tensors, doing warmup")
def run_llama(st, empty_method_cache=True):
if empty_method_cache: Device[Device.DEFAULT].method_cache.clear()
tms = [time.perf_counter()]
for i in range(10):
model(Tensor([[2]]), i).realize()
tms.append(time.perf_counter())
timings = [(tms[i+1]-tms[i])*1000 for i in range(len(tms)-1)]
print(f"{st:15s} mean runtime: {sum(timings)/len(timings):7.2f}ms, runs: ", ", ".join(f'{x:7.2f}' for x in timings))
run_llama("codegen")
run_llama("methodcache", False)
pr = start_profile()
run_llama("profile")
stop_profile(pr, sort='time', frac=0.1)
Device[Device.DEFAULT].runtime = backup_program
Device[Device.DEFAULT].buffer = backup_buffer
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python
import unittest
from tinygrad.tensor import Tensor
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.renderer.opencl import OpenCLRenderer
from tinygrad.graph import graph_uops
from tinygrad.nn import Conv2d
class TestUopsGraph(unittest.TestCase):
def test_matmul(self):
N = 1024
a = Tensor.rand(N,N)
b = Tensor.rand(N,N)
si = (a@b).lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
print(lin.colored_shape())
uops = lin.linearize().uops
graph_uops(uops)
for u in uops: print(u)
print(OpenCLRenderer("matmul", uops)[0])
def test_reduce(self):
a = Tensor.rand(1024*1024)
si = a.sum().lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
uops = lin.linearize().uops
graph_uops(uops)
#print(OpenCLRenderer("reduce", uops)[0])
def test_conv(self):
x = Tensor.rand(1,3,16,16)
c = Conv2d(3, 16, (3,3))
si = c(x).elu().lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
uops = lin.linearize().uops
graph_uops(uops)
print(lin.colored_shape())
print(OpenCLRenderer("conv", uops)[0])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,36 @@
import unittest
from pathlib import Path
import cv2
from examples.yolov3 import Darknet, infer, show_labels
from extra.utils import fetch
chicken_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/Chicken.jpg'))
car_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/car.jpg'))
class TestYOLO(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = Darknet(fetch("https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"))
print("Loading weights file (237MB). This might take a while…")
cls.model.load_weights("https://pjreddie.com/media/files/yolov3.weights")
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
labels = show_labels(infer(self.model, chicken_img), confidence=0.56)
self.assertEqual(labels, ["bird"])
def test_car(self):
labels = show_labels(infer(self.model, car_img))
self.assertEqual(labels, ["car"])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,76 @@
import numpy as np
from extra.utils import fetch, download_file
from examples.yolov8 import YOLOv8, get_variant_multiples, preprocess, postprocess, label_predictions
from pathlib import Path
import unittest
import io, cv2
import onnxruntime as ort
import ultralytics
from tinygrad.nn.state import safe_load, load_state_dict
class TestYOLOv8(unittest.TestCase):
def test_all_load_weights(self):
for variant in ['n', 's', 'm', 'l', 'x']:
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
download_file(f'https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{variant}.safetensors', weights_location)
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
      print(f'successfully loaded weights for yolov8{variant}')
def test_predictions(self):
test_image_urls = ['https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg', 'https://www.aljazeera.com/wp-content/uploads/2022/10/2022-04-28T192650Z_1186456067_UP1EI4S1I0P14_RTRMADP_3_SOCCER-ENGLAND-MUN-CHE-REPORT.jpg']
variant = 'n'
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
for i in range(len(test_image_urls)):
img_stream = io.BytesIO(fetch(test_image_urls[i]))
img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1)
test_image = preprocess([img])
predictions = TinyYolov8(test_image)
post_predictions = postprocess(preds=predictions, img=test_image, orig_imgs=[img])
labels = label_predictions(post_predictions)
      assert labels == ({5: 1, 0: 4, 11: 1} if i == 0 else {0: 13, 29: 1, 32: 1}), f"unexpected labels {labels} for image {i}"
def test_forward_pass_torch_onnx(self):
variant = 'n'
weights_location_onnx = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.onnx'
weights_location_pt = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.pt'
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
download_file(f'https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8{variant}.pt', weights_location_pt)
    # the ultralytics export prints a lot of unnecessary things
if not weights_location_onnx.is_file():
model = ultralytics.YOLO(model=weights_location_pt, task='Detect')
model.export(format="onnx",imgsz=[640, 480])
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
image_location = [np.frombuffer(io.BytesIO(fetch('https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg')).read(), np.uint8)]
orig_image = [cv2.imdecode(image_location[0], 1)]
input_image = preprocess(orig_image)
onnx_session = ort.InferenceSession(weights_location_onnx)
onnx_input_name = onnx_session.get_inputs()[0].name
onnx_output_name = onnx_session.get_outputs()[0].name
onnx_output = onnx_session.run([onnx_output_name], {onnx_input_name: input_image.numpy()})
tiny_output = TinyYolov8(input_image)
    # rtol is currently 0.025 because our predictions differ from torch by 1-2%:
    # the maxpooling layers in the SPPF module (line 280) pad with zeros rather
    # than with -infinity as torch does. The difference has no visible effect on
    # the predictions.
np.testing.assert_allclose(onnx_output[0], tiny_output.numpy(), atol=5e-4, rtol=0.025)
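    # A worked illustration of the padding effect (not executed here): for a pool
    # window whose real values are all negative, say (-0.5, -1.0), -inf padding
    # gives max(-0.5, -1.0, -inf) = -0.5 while zero padding gives
    # max(-0.5, -1.0, 0.0) = 0.0, so border activations can differ slightly.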
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,61 @@
import random
from tinygrad.helpers import DEBUG
from test.unit.test_shapetracker import CheckingShapeTracker
random.seed(42)
def do_permute(st):
perm = list(range(0, len(st.shape)))
random.shuffle(perm)
perm = tuple(perm)
if DEBUG >= 1: print("st.permute(", perm, ")")
st.permute(perm)
def do_pad(st):
c = random.randint(0, len(st.shape)-1)
pad = tuple((random.randint(0,2), random.randint(0,2)) if i==c else (0,0) for i in range(len(st.shape)))
if DEBUG >= 1: print("st.pad(", pad, ")")
st.pad(pad)
def do_reshape_split_one(st):
c = random.randint(0, len(st.shape)-1)
poss = [n for n in [1,2,3,4,5] if st.shape[c]%n == 0]
spl = random.choice(poss)
shp = st.shape[0:c] + (st.shape[c]//spl, spl) + st.shape[c+1:]
if DEBUG >= 1: print("st.reshape(", shp, ")")
st.reshape(shp)
def do_reshape_combine_two(st):
if len(st.shape) < 2: return
c = random.randint(0, len(st.shape)-2)
shp = st.shape[:c] + (st.shape[c] * st.shape[c+1], ) + st.shape[c+2:]
if DEBUG >= 1: print("st.reshape(", shp, ")")
st.reshape(shp)
def do_shrink(st):
c = random.randint(0, len(st.shape)-1)
while 1:
shrink = tuple((random.randint(0,s), random.randint(0,s)) if i == c else (0,s) for i,s in enumerate(st.shape))
if all(x<y for (x,y) in shrink): break
if DEBUG >= 1: print("st.shrink(", shrink, ")")
st.shrink(shrink)
def do_stride(st):
c = random.randint(0, len(st.shape)-1)
stride = tuple(random.choice([-2,-1,2]) if i==c else 1 for i in range(len(st.shape)))
if DEBUG >= 1: print("st.stride(", stride, ")")
st.stride(stride)
def do_expand(st):
c = [i for i,s in enumerate(st.shape) if s==1]
if len(c) == 0: return
c = random.choice(c)
expand = tuple(random.choice([2,3,4]) if i==c else s for i,s in enumerate(st.shape))
if DEBUG >= 1: print("st.expand(", expand, ")")
st.expand(expand)
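# CheckingShapeTracker (from test.unit.test_shapetracker) appears to mirror every
# movement op on a plain numpy array; assert_same() then checks that indexing
# through the symbolic ShapeTracker matches that numpy ground truth, so any
# simplification bug in the random 8-op chains below surfaces as a mismatch.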
if __name__ == "__main__":
ops = [do_permute, do_pad, do_shrink, do_reshape_split_one, do_reshape_combine_two, do_stride, do_expand]
for _ in range(200):
st = CheckingShapeTracker((random.randint(2, 10), random.randint(2, 10), random.randint(2, 10)))
for i in range(8): random.choice(ops)(st)
st.assert_same()

View File

@@ -0,0 +1,69 @@
import itertools
import random
from tinygrad.helpers import DEBUG
from tinygrad.shape.symbolic import Variable
random.seed(42)
def add_v(expr, rng=None):
if rng is None: rng = random.randint(0,2)
return expr + v[rng], rng
def div(expr, rng=None):
if rng is None: rng = random.randint(1,9)
return expr // rng, rng
def mul(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr * rng, rng
def mod(expr, rng=None):
if rng is None: rng = random.randint(1,9)
return expr % rng, rng
def add_num(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr + rng, rng
def lt(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr < rng, rng
def ge(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr >= rng, rng
def le(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr <= rng, rng
def gt(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr > rng, rng
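# Fuzz strategy: build a random tape of ops applied to a symbolic expression,
# replay the same tape on concrete ints, and compare against eval() of the
# rendered expression. A worked case with the ops above: the tape add_v(v1),
# add_num(3), div(2) maps 0 -> v1 -> v1+3 -> (v1+3)//2, which at v1=7 is
# (7+3)//2 = 5, so the rendered symbolic form must also evaluate to 5 there.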
if __name__ == "__main__":
ops = [add_v, div, mul, add_num, mod]
for _ in range(1000):
upper_bounds = [*list(range(1, 10)), 16, 32, 64, 128, 256]
u1 = Variable("v1", 0, random.choice(upper_bounds))
u2 = Variable("v2", 0, random.choice(upper_bounds))
u3 = Variable("v3", 0, random.choice(upper_bounds))
v = [u1,u2,u3]
tape = [random.choice(ops) for _ in range(random.randint(2, 30))]
# 10% of the time, add one of lt, le, gt, ge
if random.random() < 0.1: tape.append(random.choice([lt, le, gt, ge]))
expr = Variable.num(0)
rngs = []
for t in tape:
expr, rng = t(expr)
if DEBUG >= 1: print(t.__name__, rng)
rngs.append(rng)
    if DEBUG >= 1: print(expr)
space = list(itertools.product(range(u1.min, u1.max+1), range(u2.min, u2.max+1), range(u3.min, u3.max+1)))
volume = len(space)
for (v1, v2, v3) in random.sample(space, min(100, volume)):
v = [v1,v2,v3]
rn = 0
for t,r in zip(tape, rngs): rn, _ = t(rn, r)
num = eval(expr.render())
assert num == rn, f"mismatched {expr.render()} at {v1=} {v2=} {v3=} = {num} != {rn}"
if DEBUG >= 1: print(f"matched {expr.render()} at {v1=} {v2=} {v3=} = {num} == {rn}")

View File

@@ -0,0 +1,61 @@
import unittest
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d, BatchNorm2d, optim
def model_step(lm):
with Tensor.train():
x = Tensor.ones(8,12,128,256, requires_grad=False)
optimizer = optim.SGD(get_parameters(lm), lr=0.001)
loss = lm.forward(x).sum()
optimizer.zero_grad()
loss.backward()
del x,loss
optimizer.step()
class TestBatchnorm(unittest.TestCase):
def test_conv(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
def forward(self, x):
return self.c(x).relu()
lm = LilModel()
model_step(lm)
def test_two_conv(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
self.c2 = Conv2d(32, 32, 3, padding=1, bias=False)
def forward(self, x):
return self.c2(self.c(x)).relu()
lm = LilModel()
model_step(lm)
def test_two_conv_bn(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 24, 3, padding=1, bias=False)
self.bn = BatchNorm2d(24, track_running_stats=False)
self.c2 = Conv2d(24, 32, 3, padding=1, bias=False)
self.bn2 = BatchNorm2d(32, track_running_stats=False)
def forward(self, x):
x = self.bn(self.c(x)).relu()
return self.bn2(self.c2(x)).relu()
lm = LilModel()
model_step(lm)
def test_conv_bn(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
self.bn = BatchNorm2d(32, track_running_stats=False)
def forward(self, x):
return self.bn(self.c(x)).relu()
lm = LilModel()
model_step(lm)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,74 @@
import unittest
import numpy as np
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, CI
def multidevice_test(fxn):
exclude_devices = getenv("EXCLUDE_DEVICES", "").split(",")
def ret(self):
for device in Device._buffers:
if device in ["DISK", "SHM", "FAKE"]: continue
if not CI: print(device)
if device in exclude_devices:
if not CI: print(f"WARNING: {device} test is excluded")
continue
with self.subTest(device=device):
try:
Device[device]
except Exception:
if not CI: print(f"WARNING: {device} test isn't running")
continue
fxn(self, device)
return ret
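# multidevice_test runs one test body as a subTest per registered backend:
# devices named in the comma-separated EXCLUDE_DEVICES env var are skipped, as
# are backends whose Device[device] lookup raises, so a missing driver doesn't
# fail the whole suite.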
class TestExample(unittest.TestCase):
@multidevice_test
def test_convert_to_cpu(self, device):
a = Tensor([[1,2],[3,4]], device=device)
assert a.numpy().shape == (2,2)
b = a.cpu()
assert b.numpy().shape == (2,2)
@multidevice_test
def test_2_plus_3(self, device):
a = Tensor([2], device=device)
b = Tensor([3], device=device)
result = a + b
print(f"{a.numpy()} + {b.numpy()} = {result.numpy()}")
assert result.numpy()[0] == 5.
@multidevice_test
def test_example_readme(self, device):
x = Tensor.eye(3, device=device, requires_grad=True)
y = Tensor([[2.0,0,-2.0]], device=device, requires_grad=True)
z = y.matmul(x).sum()
z.backward()
x.grad.numpy() # dz/dx
y.grad.numpy() # dz/dy
assert x.grad.device == device
assert y.grad.device == device
@multidevice_test
def test_example_matmul(self, device):
x = Tensor.eye(64, device=device, requires_grad=True)
y = Tensor.eye(64, device=device, requires_grad=True)
z = y.matmul(x).sum()
z.backward()
x.grad.numpy() # dz/dx
y.grad.numpy() # dz/dy
assert x.grad.device == device
assert y.grad.device == device
if __name__ == '__main__':
unittest.main()