Add openpilot tests

FrogAi
2024-03-06 14:58:47 -07:00
parent 2901597132
commit b39097a12d
259 changed files with 31176 additions and 12 deletions

View File

@@ -0,0 +1,12 @@
FROM ubuntu:20.04

# Install Python 3.8 and pip
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.8 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Install tinygrad and its Python dependencies as an editable package
COPY . ./tinygrad
WORKDIR tinygrad
RUN pip3 install -e .
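
A quick smoke test for the resulting image (the `tinygrad-test` tag is hypothetical; build with `docker build -t tinygrad-test .`):

# e.g. docker run --rm tinygrad-test python3 -c "from tinygrad.tensor import Tensor; print(Tensor.ones(2,2).numpy())"
from tinygrad.tensor import Tensor
print(Tensor.ones(2, 2).numpy())  # the editable install works if this prints a 2x2 array of ones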

View File

@@ -0,0 +1,62 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()

from extra.dist import collectives
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def allreduce_jit(t:Tensor, cache_id=None) -> Tensor:
  return collectives.allreduce(t, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2
SIZE_2 = 255 if not CI else 3

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.zeros(SIZE, SIZE) if rank != 0 else Tensor.ones(SIZE, SIZE)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test")
    assert np.allclose(np.ones((SIZE, SIZE)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  # reset jit
  allreduce_jit.cnt = 0
  allreduce_jit.input_replace = {}

  # test uneven chunk sizes
  for _ in range(3):
    # create a tensor to send
    t = Tensor.ones(SIZE_2, SIZE_2, SIZE_2) if rank == 0 else Tensor.zeros(SIZE_2, SIZE_2, SIZE_2)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test2")
    assert np.allclose(np.ones((SIZE_2, SIZE_2, SIZE_2)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    from tinygrad.runtime.ops_hip import HIP
    devices = [f"hip:{i}" for i in range(HIP.device_count)]
  else:
    from tinygrad.runtime.ops_gpu import CL
    devices = [f"gpu:{i}" for i in range(len(CL.devices))] if not CI else ["gpu:0", "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
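
For intuition, the invariant this test checks can be restated in plain numpy: allreduce sums every rank's tensor and hands the identical result back to all ranks, so one rank contributing ones and the rest contributing zeros must reduce to ones everywhere. A minimal sketch (the two-element world mirrors the CI device list above):

import numpy as np
world = [np.ones((2, 2)), np.zeros((2, 2))]  # rank 0 sends ones, rank 1 sends zeros
reduced = sum(world)                         # every rank ends up with this same sum
assert np.allclose(reduced, np.ones((2, 2)))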

View File

@@ -0,0 +1,68 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()

from extra.dist import world
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def send_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.send(t, target_rank, cache_id=cache_id).realize()

@TinyJit
def recv_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.recv(t, target_rank, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.randn(SIZE, SIZE)

    # send to rank 1
    if rank == 0:
      send_jit(t, 1, cache_id="test")
    elif rank == 1:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 0, cache_id="test")

    # recv from rank 1
    if rank == 0:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 1, cache_id="test2")
    elif rank == 1:
      send_jit(t2, 0, cache_id="test2")

    # check that the received tensor is the same as the sent tensor
    if rank == 0:
      assert np.allclose(t.numpy(), t2.numpy()), f"{t2.numpy()} wasn't equal to {t.numpy()}"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    devices = ["hip:0", "hip:1"]
  else:
    devices = ["gpu:0", "gpu:1" if not CI else "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
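
The round trip above reduces to a simple property: what rank 0 sends out must come back unchanged after rank 1 echoes it. The check rank 0 performs, restated in plain numpy:

import numpy as np
t = np.random.randn(4, 4).astype(np.float32)
wire = t.copy()       # rank 0 -> rank 1
echoed = wire.copy()  # rank 1 -> rank 0
assert np.allclose(t, echoed)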

View File

@@ -0,0 +1,27 @@
import unittest
from tinygrad.helpers import prod
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import GlobalCounters
from tinygrad.jit import CacheCollector

class TestCopy(unittest.TestCase):
  def test_add1(self):
    pts = []
    for i in range(16384, 16384*256, 16384):
      t = Tensor.randn(i).realize()
      CacheCollector.start()
      t.assign(t+1).realize()
      fxn, args, _ = CacheCollector.finish()[0]
      GlobalCounters.reset()
      def run(): return fxn(args, force_wait=True)
      ct = min([run() for _ in range(10)])
      mb = prod(t.shape)*t.dtype.itemsize*2*1e-6
      print(f"{mb*1e3:.2f} kB, {ct*1e3:.2f} ms, {mb/ct:.2f} MB/s")
      pts.append((mb, mb/ct))
    from matplotlib import pyplot as plt
    plt.plot([x[0] for x in pts], [x[1] for x in pts])
    plt.show()

if __name__ == '__main__':
  unittest.main()
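
For reference, the MB/s figure counts both the read and the write of the buffer (assign(t+1) touches each element twice), which is the factor of 2 in the mb expression. Recomputed standalone with illustrative numbers:

numel, itemsize, seconds = 16384, 4, 0.001  # hypothetical measurement
mb = numel * itemsize * 2 * 1e-6            # megabytes moved: one read plus one write
print(f"{mb/seconds:.2f} MB/s")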

View File

@@ -0,0 +1,102 @@
from lm_eval.base import BaseLM
from lm_eval import evaluator, tasks
import torch, json, argparse
from examples.llama import LLaMa
from tinygrad.tensor import Tensor
from tinygrad.ops import Device

class LLaMaAdaptor(BaseLM):
  def __init__(
    self,
    model_size="7B",
    model_gen=1,
    device="",
    quantize=False,
    batch_size=1,
    max_batch_size=1,
    do_sample=False,
    temperature=1.0,
    checkpoint_path="",
    tokenizer_path="",
  ):
    super().__init__()

    if batch_size is None:
      batch_size = 1
    self.do_sample = do_sample
    self.temperature = temperature
    self._device = device

    assert isinstance(model_gen, int)
    assert isinstance(model_size, str)
    assert isinstance(batch_size, int)
    assert isinstance(checkpoint_path, str)
    assert isinstance(tokenizer_path, str)

    self.llama = LLaMa.build(checkpoint_path, tokenizer_path, model_gen, model_size, quantize)

  @classmethod
  def create_from_arg_string(cls, arg_string, additional_config=None):
    kwargs = {el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",")}
    return cls(**kwargs, **(additional_config or {}))

  @property
  def eot_token_id(self):
    # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
    return self.llama.tokenizer.eos_id()

  @property
  def max_length(self):
    return 1024

  @property
  def max_gen_toks(self):
    return 256

  @property
  def batch_size(self):
    return 1

  @property
  def device(self):
    return self._device

  def tok_encode(self, string: str):
    return [self.llama.tokenizer.bos_id()] + self.llama.tokenizer.encode(string)

  def tok_decode(self, tokens):
    return self.llama.tokenizer.decode(tokens)

  def _model_call(self, inps):
    Tensor.no_grad = True
    return torch.Tensor(self.llama.model(Tensor(inps.numpy()), 0).numpy())

  def greedy_until(self, requests):
    continuations = []
    for request in requests:
      prompt, until = request[0], request[1]['until']
      output = self.llama.greedy_until(prompt, until, max_length=128, temperature=0.0)
      continuations.append(output[len(prompt):])
    return continuations

  def _model_generate(self, context, max_length, eos_token_id):
    raise NotImplementedError()

if __name__ == '__main__':
  print(f"using {Device.DEFAULT} backend")

  parser = argparse.ArgumentParser(description='Run LLaMA evals in tinygrad', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--size', type=str, default="7B", help="Size of model to use [7B, 13B, 30B, 65B] for Gen 1, [7B, 13B] for Gen 2")
  parser.add_argument('--gen', type=int, default=1, help="Generation of the model to use [1, 2]")
  parser.add_argument('--quantize', action='store_true', help="Quantize the weights to int8 in memory")
  parser.add_argument('--eval', type=str, default="arc_easy", help="Evaluation task(s) to run, comma separated")
  parser.add_argument('--limit', type=int, default=None, help="Limit the number of examples per eval task")
  parser.add_argument('--weights', type=str, default="./weights/LLaMa/", help="Location of the weights")
  parser.add_argument('--tokenizer', type=str, default="./weights/LLaMa/tokenizer.model", help="Location of the tokenizer")
  args = parser.parse_args()

  # run eval and exit
  adaptor = LLaMaAdaptor(model_gen=args.gen, model_size=args.size, quantize=args.quantize, checkpoint_path=args.weights, tokenizer_path=args.tokenizer, device="cpu")
  results = evaluator.evaluate(adaptor, tasks.get_task_dict(args.eval.split(",")), False, 0, args.limit)
  print(json.dumps(results, indent=2))
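
lm-eval can also construct the adaptor through create_from_arg_string; note that values parsed from the argument string arrive as str, so typed fields are best passed via additional_config. A hedged usage sketch (the paths are placeholders):

adaptor = LLaMaAdaptor.create_from_arg_string(
  "model_size=7B,checkpoint_path=./weights/LLaMa/,tokenizer_path=./weights/LLaMa/tokenizer.model",
  additional_config={"model_gen": 1, "device": "cpu"},
)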

View File

@@ -0,0 +1,128 @@
import csv, pathlib, time, numpy as np
from os import getenv
import torch
torch.set_num_threads(1)
import onnx
from onnx.helper import tensor_dtype_to_np_dtype
import onnxruntime as ort
from onnx2torch import convert
from extra.utils import download_file
from extra.onnx import get_run_onnx
from tinygrad.helpers import OSX, DEBUG
from tinygrad.tensor import Tensor
from tinygrad.ops import Device

MODELS = {
  "resnet50": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
  "openpilot": "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx",
  "efficientnet": "https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx",
  "shufflenet": "https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx",
  "commavq": "https://huggingface.co/commaai/commavq-gpt2m/resolve/main/gpt2m.onnx",

  # broken in torch MPS
  #"zfnet": "https://github.com/onnx/models/raw/main/vision/classification/zfnet-512/model/zfnet512-9.onnx",
  # TypeError: BatchNormalization() got an unexpected keyword argument 'is_test'
  #"densenet": "https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-3.onnx",
  # AssertionError: only onnx version >= 10 supported for slice
  #"bert": "https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-8.onnx",
  # really slow
  #"resnet18": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx",
}

CSV = {}
open_csv = None
torch.manual_seed(1)

def benchmark(mnm, nm, fxn):
  tms = []
  for _ in range(3):
    st = time.perf_counter_ns()
    ret = fxn()
    tms.append(time.perf_counter_ns() - st)
  print(f"{mnm:15s} {nm:25s} {min(tms)*1e-6:7.2f} ms")
  CSV[nm] = min(tms)*1e-6
  return min(tms), ret

#BASE = pathlib.Path(__file__).parents[2] / "weights" / "onnx"
BASE = pathlib.Path("/tmp/onnx")

def benchmark_model(m, validate_outs=False):
  global open_csv, CSV
  CSV = {"model": m}

  fn = BASE / MODELS[m].split("/")[-1]
  download_file(MODELS[m], fn)
  onnx_model = onnx.load(fn)
  output_names = [out.name for out in onnx_model.graph.output]
  excluded = {inp.name for inp in onnx_model.graph.initializer}
  input_shapes = {inp.name:tuple(x.dim_value if x.dim_value != 0 else 1 for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input if inp.name not in excluded}
  input_types = {inp.name: tensor_dtype_to_np_dtype(inp.type.tensor_type.elem_type) for inp in onnx_model.graph.input if inp.name not in excluded}
  #input_types = {k:v if v!=np.float16 else np.float32 for k,v in input_types.items()} # cast
  np_inputs = {k:torch.randn(shp).numpy().astype(input_types[k]) for k,shp in input_shapes.items()}
  assert len(input_shapes) < 30, f"too many input shapes {len(input_shapes)}"

  # print input names
  if DEBUG >= 2: print([inp.name for inp in onnx_model.graph.input if inp.name not in excluded])

  for device in ["METAL" if OSX else "GPU", "CLANG"]: # + (["CUDA"] if torch.cuda.is_available() else []):
    Device.DEFAULT = device
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})

    from tinygrad.jit import TinyJit
    tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
    for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}  # warm up the jit
    benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()})
    del inputs, tinygrad_model, tinygrad_jitted_model

  try:
    torch_model = convert(onnx_model)
    torch_inputs = [torch.tensor(x) for x in np_inputs.values()]
    benchmark(m, "torch_cpu", lambda: torch_model(*torch_inputs))

    torch_device = "mps" if OSX else "cuda"
    torch_mps_model = torch_model.to(torch_device)
    torch_mps_inputs = [x.to(torch_device) for x in torch_inputs]
    benchmark(m, f"torch_{torch_device}", lambda: torch_mps_model(*torch_mps_inputs))
  except Exception as e: print(f"{m:16s}onnx2torch {type(e).__name__:>25}")

  # bench onnxruntime
  ort_options = ort.SessionOptions()
  ort_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
  ort_options.log_severity_level = 3  # no warnings
  for backend in ["CPU", "CUDA" if not OSX else "CoreML"]:  # https://onnxruntime.ai/docs/execution-providers/
    provider = backend+"ExecutionProvider"
    if provider not in ort.get_available_providers(): continue
    ort_sess = ort.InferenceSession(str(fn), ort_options, [provider])
    benchmark(m, f"onnxruntime_{backend.lower()}", lambda: ort_sess.run(output_names, np_inputs))
    del ort_sess

  if validate_outs:
    rtol, atol = 2e-3, 2e-3  # tolerance for fp16 models
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    tinygrad_out = tinygrad_model(inputs)

    ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
    onnx_out = ort_sess.run(output_names, np_inputs)
    onnx_out = dict(zip(output_names, onnx_out))
    assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
    print(f"{m:16s}outputs validated with rtol={rtol:.1e}, atol={atol:.1e}")

  if open_csv is None:
    open_csv = csv.DictWriter(open('onnx_inference_speed.csv', 'w', newline=''), fieldnames=list(CSV.keys()))
    open_csv.writeheader()
  open_csv.writerow(CSV)

def assert_allclose(tiny_out:dict, onnx_out:dict, rtol=1e-5, atol=1e-5):
  assert len(tiny_out) == len(onnx_out) and tiny_out.keys() == onnx_out.keys()
  for k in tiny_out.keys():
    tiny_v, onnx_v = tiny_out[k], onnx_out[k]
    if tiny_v is None: assert tiny_v == onnx_v
    else: np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}")

if __name__ == "__main__":
  if getenv("MODEL", "") != "": benchmark_model(getenv("MODEL", ""), True)
  else:
    for m in MODELS: benchmark_model(m, True)
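
The benchmark() helper above is the usual min-of-N wall-clock pattern: taking the minimum over repeated runs filters out warm-up and scheduling noise. The same idea as a standalone sketch:

import time
def time_fxn(fxn, n=3):
  tms = []
  for _ in range(n):
    st = time.perf_counter_ns()
    fxn()
    tms.append(time.perf_counter_ns() - st)
  return min(tms) * 1e-6  # best-of-n, in milliseconds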

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# cd disassemblers/ && git clone --recursive git@github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored, Timing
from tinygrad.runtime.ops_gpu import CL

# TODO: support multidevice in cuda
device = 'gpu'

if __name__ == "__main__":
  sz = 1024*1024*256  # 1 GB
  #sz = 1024*64

  with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
    c0 = Tensor.ones(sz, device="cpu").realize()
    c1 = (Tensor.ones(sz, device="cpu")/2).realize()

  with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a0 = c0.to(f'{device}:0').realize()
    CL.synchronize()
  with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b1 = c1.to(f'{device}:1').realize()
    CL.synchronize()

  # cross copy. this is going through the CPU
  with Timing("0 -> CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a1 = a0.to(f'{device}:1').realize()
    CL.synchronize()
  with Timing("1 -> CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b0 = b1.to(f'{device}:0').realize()
    CL.synchronize()

  # sum
  with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab0 = (a0 + b0).realize()
    CL.synchronize()
  with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab1 = (a1 + b1).realize()
    CL.synchronize()

  # cross device sum (does this work?)
  # is this making a copy first? is that copy through the CPU?
  # the slowness comes from the *blocking* clprg call, is this pyopencl?
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx0 = (a0 + b1).realize()
    CL.synchronize()
  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx1 = (b1 + a0).realize()
    CL.synchronize()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc1 = ab1.numpy()

  # same
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # devices
  print(ab0)
  print(ab1)
  print(abx0)
  print(abx1)
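
A note on the on_exit math used throughout: Timing passes the elapsed time in nanoseconds, and bytes per nanosecond equals gigabytes per second, so (sz*4)/x already yields GB/sec for float32 buffers. For example:

sz, ns = 1024*1024*256, 250_000_000  # 1 GB of float32 moved in 0.25 s
print(f"{(sz*4)/ns:.2f} GB/sec")     # -> ~4.29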

View File

@@ -0,0 +1,41 @@
from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
from tinygrad.helpers import dtypes
import time

N = 1000000
a = CLBuffer(N, dtypes.float32)
b = CLBuffer(N, dtypes.float32)
c = CLBuffer(N, dtypes.float32)

prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
  int idx = get_global_id(0);
  a[idx] = b[idx] + c[idx];
}""")

prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
t1 = time.monotonic_ns()
e1 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t2 = time.monotonic_ns()
time.sleep(3)
t3 = time.monotonic_ns()
e2 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t4 = time.monotonic_ns()

print(e1.profile.queued)
print(e1.profile.submit)
print(e1.profile.start)
print(e1.profile.end)

print(e1, e2)
print(t2-t1, e1.profile.end - e1.profile.start)
print(t4-t3, e2.profile.end - e2.profile.start)
print(t3-t2, e2.profile.queued - e1.profile.end)
print((t3-t2) / (e2.profile.start - e1.profile.end), "ratio")
print("ratio since boot", t1/e1.profile.start)

print(e1.profile.start)
print(e1.profile.end)
print(e2.profile.start)
print(e2.profile.end)
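
The e1/e2.profile fields above are OpenCL's event profiling counters: queued, submit, start, and end are device-side timestamps in nanoseconds, which is why they can be compared directly against the monotonic_ns deltas. Kernel execution time alone would be:

def kernel_ms(evt):
  # pyopencl exposes CL_PROFILING_COMMAND_{START,END} in nanoseconds
  return (evt.profile.end - evt.profile.start) * 1e-6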

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env python
import unittest, gc
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_state_dict
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad.ops import Device
from test.helpers import derandomize_model
from examples.llama import Transformer

ALLOCATED_DEV_BUFS = 0
class FakeDeviceBuffer:
  def __init__(self, sz, dt, device):
    self.id = 1
    self.size = sz
    self.dtype = dt
    self.device = device

    global ALLOCATED_DEV_BUFS
    ALLOCATED_DEV_BUFS += 1

class FakeAllocator(LRUAllocator):
  def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
  def _do_free(self, buf):
    buf.id -= 1
    assert buf.id == 0, f"Free should be called once, but {buf.id}"
  def __del__(self):  # Fake allocator should clear all buffers after each test.
    for v in self.cached_buffers.values():
      for buf, _ in v: self._free_buffer(buf)

FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
  def __init__(self, size, dtype, device='0'):
    global FAKE_GLOBAL_ALLOCATOR
    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
    assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)

class FakeProgram:
  def __init__(self, name:str, prg:str): pass
  def __call__(self, *bufs, global_size, local_size, wait=False): pass

def helper_test_correctness(gen, train):
  from tinygrad.runtime.ops_gpu import CL, CLAllocator
  old_alloc = CL.cl_allocator
  CL.cl_allocator = CLAllocator(0)
  no_alloc_result = train(*gen()).numpy()
  Device[Device.DEFAULT].synchronize()
  CL.cl_allocator = CLAllocator(512<<30)  # test cache correctness, so cache as much as possible (512 GB)
  for _ in range(4):
    GlobalCounters.reset()
    np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
    Device[Device.DEFAULT].synchronize()
  assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
  CL.cl_allocator = old_alloc

def __helper_test_alloc_count(gen, train):
  was_alloc = ALLOCATED_DEV_BUFS
  for _ in range(2):
    train(*gen())
  return ALLOCATED_DEV_BUFS - was_alloc

def helper_test_alloc_count(mm, gen, train):
  global FAKE_GLOBAL_ALLOCATOR
  backup_program = Device[Device.DEFAULT].runtime
  backup_buffer = Device[Device.DEFAULT].buffer
  Device[Device.DEFAULT].runtime = FakeProgram
  Device[Device.DEFAULT].buffer = FakeBuffer
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
  new_allocs = __helper_test_alloc_count(gen, train)
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
  old_allocs = __helper_test_alloc_count(gen, train)
  print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
  assert new_allocs < old_allocs, "Hmm, doesn't the cache work anymore?"
  Device[Device.DEFAULT].runtime = backup_program
  Device[Device.DEFAULT].buffer = backup_buffer
  FAKE_GLOBAL_ALLOCATOR = None

def check_gc():
  if Device.DEFAULT == "GPU":
    gc.collect()  # Need to collect Tensors.
    from extra.introspection import print_objects
    assert print_objects() == 0

class TestAllocators(unittest.TestCase):
  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def __test():
      model = Transformer(**args_tiny)
      derandomize_model(model)
      def test(t): return model(t, 0).realize()
      helper_test_correctness(lambda: (Tensor([[1,]]),), test)
    __test()
    Tensor.default_type = old_type
    check_gc()

  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama_alloc_counts(self):
    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def test_alloc_count(t):
      model = Transformer(**args_tiny)
      for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
      return model(t, 0).realize()
    helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
    check_gc()

  @unittest.skip("huge for CI")
  def test_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()
    helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16), Tensor.randn(1, 77, 768)), test)

if __name__ == "__main__":
  unittest.main()
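
The behavior these fakes exercise reduces to a simple idea: an LRU-style allocator parks freed buffers in a cache keyed by (size, dtype) and hands them back on the next matching request, so repeated model runs should allocate far fewer fresh device buffers. A toy sketch of that idea (not the real LRUAllocator API):

from collections import defaultdict
class ToyLRUAllocator:
  def __init__(self): self.cache = defaultdict(list)
  def alloc(self, size, dtype):
    bucket = self.cache[(size, dtype)]
    return bucket.pop() if bucket else object()  # reuse a parked buffer if possible, else "allocate"
  def free(self, buf, size, dtype):
    self.cache[(size, dtype)].append(buf)  # park for reuse instead of releasing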

View File

@@ -0,0 +1,8 @@
from tinygrad.tensor import Tensor
from tinygrad.nn import Embedding

if __name__ == "__main__":
  vocab_size = 50257
  dim = 128
  test = Embedding(vocab_size, dim)
  ret = test(Tensor([[1,2,3]])).numpy()
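A sanity check one could append: Embedding maps (batch, seq) integer ids to (batch, seq, dim) vectors, so for this input:

  assert ret.shape == (1, 3, 128), ret.shape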

View File

@@ -0,0 +1,208 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.ops import LazyOp, ReduceOps, BinaryOps, UnaryOps, MovementOps
from tinygrad.shape.shapetracker import ShapeTracker, View, ZeroView
from tinygrad.runtime.ops_gpu import GPUBuffer, CLProgram, CLCodegen
#from tinygrad.runtime.ops_metal import MetalBuffer as GPUBuffer, MetalProgram as CLProgram, MetalCodegen as CLCodegen
from tinygrad.helpers import getenv
from extra.lib_test_ast import test_ast
import platform
OSX = platform.system() == "Darwin"

def compile_and_test_ast(ast, local_size=None):
  k = CLCodegen(ast)
  prg = k.codegen().build(CLProgram)
  if local_size is not None: prg.local_size = local_size
  for i in range(5): prg(prg.lower(k.bufs))
  if getenv("TEST", 0): test_ast(k)

class TestAST(unittest.TestCase):
  def test_conv_zeroview_ast(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (2, 2, 2, 1), -3), ZeroView((1, 1, 1, 2), ((0, 1), (0, 1), (-1, 2), (-1, 3))), View((1, 1, 3, 4), (0, 0, 4, 1), 0)]))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (0, 0, 0, 0), 0)]))
    op1 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(UnaryOps.RELU, (op1,), None)
    compile_and_test_ast(ast)

  def test_cifar_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 64, 34, 34), (65536, 1024, 32, 1), -33), ZeroView((512, 64, 32, 32), ((0, 512), (0, 64), (-1, 33), (-1, 33))), View((512, 1, 128, 32, 32, 64, 3, 3), (73984, 73984, 0, 34, 1, 1156, 34, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 64, 32, 32), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 1, 128, 32, 32, 64, 3, 3), (0, 0, 576, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 64, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(ReduceOps.SUM, (op0,), (512, 1, 128, 32, 32, 1, 1, 1))
    compile_and_test_ast(ast)

  def test_cifar_conv_backward(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 512, 10, 10), (64, 16384, 8, 1), -9), ZeroView((256, 512, 8, 8), ((0, 256), (0, 512), (-1, 9), (-1, 9))), View((256, 1, 512, 3, 3, 512, 8, 8), (51200, 51200, 0, 10, 1, 100, 10, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 256, 8, 8), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 1, 512, 3, 3, 512, 8, 8), (0, 0, 64, 0, 0, 32768, 8, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 512, 8, 8), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (256, 1, 512, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (256, 512, 3, 3))
    compile_and_test_ast(ast)

  def test_first_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 130, 258, 1, 12), (393216, 3072, 12, 12, 1), -3084), ZeroView((1, 128, 256, 1, 12), ((0, 1), (-1, 129), (-1, 257), (0, 1), (0, 12))), View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 6192, 24, 0, 0, 3096, 12, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 768, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 0, 0, 432, 4, 144, 16, 48, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 108, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_second_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 66, 130, 32, 1), (262144, 4096, 32, 1, 1), -4128), ZeroView((1, 64, 128, 32, 1), ((0, 1), (-1, 65), (-1, 129), (0, 32), (0, 1))), View((1, 64, 128, 8, 4, 1, 1, 3, 3), (266240, 4160, 32, 4, 1, 12480, 12480, 4160, 32), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 64, 128, 8, 4, 1, 1, 3, 3), (0, 0, 0, 36, 1, 0, 0, 12, 4), 0)]), hostbuf=GPUBuffer(shape=(8, 9, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_third_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 4096, 32, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 0, 0, 128, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(4, 32, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 4, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 4, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(16,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op2,), (64, 512, 4))
    compile_and_test_ast(ast)

  # VALIDHACKS=1 IMAGE=2 DEBUG=4 PYTHONPATH="." GPU=1 OPT=2 python3 test/external_test_gpu_ast.py TestAST.test_reduce_op
  # 164 time 27.75 ms running re_S128_4 with [128] None count 4 runtime 1016.06 us 2.07 GFLOPS () -> (128, 1)
  # 169 time 22.51 ms running matmul with [4, 16, 128] [4, 16, 16] count 5 runtime 110.08 us 19.06 GFLOPS ('-DMATMUL',) -> (128, 1)
  def test_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 128, 4, 1, 1, 1, 1), (512, 512, 512, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  def test_alt_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  # re_S32_16_36_6 is fast
  def test_1x1_36_6(self):  # 36 <- 6
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 1536, 24, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 0, 0, 0, 0, 96, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(36, 24, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(144,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op6 = LazyOp(BinaryOps.MAX, (op5,buf5,), None)
    op7 = LazyOp(BinaryOps.SUB, (op3,op6,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op7,), (32, 2304, 4))
    compile_and_test_ast(ast, None if OSX else (16, 16, 4))

  # re_S32_16_6_36 is slow
  def test_1x1_6_36(self):  # 6 <- 36
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 9216, 144, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 2304, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 0, 0, 0, 0, 576, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 144, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 1536, 24, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    op3 = LazyOp(BinaryOps.ADD, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 16, 4))

  # re_S32_16_6_24
  def test_1x1_6_24(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 6144, 96, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 1536, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 0, 0, 0, 0, 384, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 96, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    #buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    #op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 4, 8))

  def test_full_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (512, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op0 = LazyOp(BinaryOps.ADD, (buf0,buf1,), None)
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([2.], dtype=np.float32)))
    op1 = LazyOp(BinaryOps.POW, (op0,buf2,), None)
    op2 = LazyOp(ReduceOps.SUM, (op1,), (1, 1))
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([0.5], dtype=np.float32)))
    op3 = LazyOp(BinaryOps.POW, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op4 = LazyOp(BinaryOps.SUB, (op3,buf4,), None)
    op5 = LazyOp(UnaryOps.RELU, (op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op6 = LazyOp(BinaryOps.ADD, (op5,buf5,), None)
    buf6 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([3.4e+38], dtype=np.float32)))
    op7 = LazyOp(BinaryOps.SUB, (op3,buf6,), None)
    op8 = LazyOp(UnaryOps.RELU, (op7,), None)
    op9 = LazyOp(BinaryOps.SUB, (op6,op8,), None)
    op10 = LazyOp(UnaryOps.RECIPROCAL, (op9,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op10,), (1, 1))
    compile_and_test_ast(ast)

  def test_1239_reduce(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 64, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 1024, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(1239, 256, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1239, 4, 1, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (1, 1, 1, 1, 4956))
    compile_and_test_ast(ast)

  def test_enet_first_conv_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 3, 225, 225), (150528, 50176, 224, 1), 0), ZeroView((8, 3, 224, 224), ((0, 8), (0, 3), (0, 225), (0, 225))), View((8, 1, 32, 112, 112, 3, 3, 3), (151875, 151875, 0, 450, 2, 50625, 225, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 1, 32, 112, 112, 3, 3, 3), (0, 0, 27, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (8, 1, 32, 112, 112, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (8, 32, 112, 112))
    compile_and_test_ast(ast)

  def test_enet_reduce_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 32, 225, 225), (50176, 150528, 224, 1), 0), ZeroView((3, 32, 224, 224), ((0, 3), (0, 32), (0, 225), (0, 225))), View((3, 1, 32, 3, 3, 32, 112, 112), (1620000, 1620000, 0, 225, 1, 50625, 450, 2), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 1, 32, 3, 3, 32, 112, 112), (0, 12845056, 401408, 0, 0, 12544, 112, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 32, 1, 1, 32, 112, 112), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (3, 1, 32, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (3, 32, 3, 3))
    compile_and_test_ast(ast)

if __name__ == '__main__':
  unittest.main()

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python
import os
import unittest
import numpy as np

if 'IMAGE' not in os.environ:
  os.environ['IMAGE'] = '2'
  os.environ['GPU'] = '1'
  os.environ['OPT'] = '2'

from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
Tensor.no_grad = True

class TestImage(unittest.TestCase):
  def test_create_image(self):
    t = Tensor.ones(128, 128, 1)
    t = t.reshape(128, 32, 4) + 3
    t.realize()
    np.testing.assert_array_equal(t.numpy(), np.ones((128,32,4))*4)

  def test_sum_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t1.realize()
    t1 = t1.sum()
    t1.realize()
    assert t1.numpy() == 16*4*4*4, f"got {t1.numpy()}"

  def test_add_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t2 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 4
    t1.realize()
    t2.realize()
    t3 = t1 + t2
    t3.realize()
    np.testing.assert_array_equal(t3.numpy(), np.ones((16,4,4))*9)

  def test_padded_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    tiny_conv(tiny_dat).realize()

  def test_op_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dconv = Conv2d(out_chans, out_chans, 1, bias=None, padding=0)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    p2 = tiny_conv(tiny_dat).relu()
    p2 = tiny_dconv(p2)
    p2.realize()

if __name__ == '__main__':
  unittest.main()

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
from tinygrad.helpers import dtypes, CI
from tinygrad.ops import Device
from test.helpers import derandomize_model
from examples.llama import Transformer

def helper_test_jitted_correctness(gen, train, train_jit):
  nojit = train(*gen()).numpy()
  for _ in range(5): jit = train_jit(*gen()).numpy()
  np.testing.assert_allclose(nojit, jit, rtol=1e-3, atol=1e-5)

@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE, "needs JIT")
class TestJittedModels(unittest.TestCase):
  def test_jitted_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    derandomize_model(model)
    def test(t): return model(t, 0).realize()

    @TinyJit
    def test_jit(t): return model(t, 0).realize()
    helper_test_jitted_correctness(lambda: (Tensor([[1,]]),), test, test_jit)
    Tensor.default_type = old_type

  @unittest.skipUnless(not CI, "huge for CI")
  def test_jitted_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()

    @TinyJit
    def test_jit(t, t2): return model(t, 801, t2).realize()
    helper_test_jitted_correctness(lambda: (Tensor.randn(1, 4, 16, 16), Tensor.randn(1, 77, 768)), test, test_jit)

if __name__ == "__main__":
  unittest.main()
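
Context for the helper above: TinyJit captures kernels during the first calls and replays them afterwards, which is why train_jit runs several times before its output is compared against the unjitted baseline. The basic pattern, as a sketch:

from tinygrad.jit import TinyJit
from tinygrad.tensor import Tensor

@TinyJit
def double(x: Tensor) -> Tensor: return (x * 2).realize()

for _ in range(3): out = double(Tensor.randn(4, 4))  # early runs capture kernels, later runs replay them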

View File

@@ -0,0 +1,208 @@
import unittest
from onnx.backend.base import Backend, BackendRep
import onnx.backend.test
import numpy as np
from tinygrad.tensor import Tensor
from typing import Any, Tuple
from tinygrad.helpers import getenv, CI
# pip3 install tabulate
pytest_plugins = 'onnx.backend.test.report',
from extra.onnx import get_run_onnx
class TinygradModel(BackendRep):
  def __init__(self, run_onnx, input_names):
    super().__init__()
    self.fxn = run_onnx
    self.input_names = input_names

  def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
    real_inputs = {k:v for k,v in zip(self.input_names, inputs)}
    ret = self.fxn(real_inputs, debug=True)
    return tuple(x.numpy() if isinstance(x, Tensor) else [i.numpy() for i in x] if isinstance(x, list) else np.array(x) for x in ret.values())

class TinygradBackend(Backend):
  @classmethod
  def prepare(cls, model, device):
    input_all = [x.name for x in model.graph.input]
    input_initializer = [x.name for x in model.graph.initializer]
    net_feed_input = [x for x in input_all if x not in input_initializer]
    print("prepare", cls, device, net_feed_input)
    run_onnx = get_run_onnx(model)
    return TinygradModel(run_onnx, net_feed_input)

  @classmethod
  def supports_device(cls, device: str) -> bool:
    return device == "CPU"
backend_test = onnx.backend.test.BackendTest(TinygradBackend, __name__)
# no support for reduce with multiply (needs llop)
backend_test.exclude('test_reduce_prod_*')
# TODO figure out why it's returning wrong values, geohotstan's uneducated guess is it's due to imprecision from float64 (double) -> float32
# see Type Constraints: https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Adam.html#type-constraints
backend_test.exclude('test_adam_multiple_cpu')
backend_test.exclude('test_nesterov_momentum_cpu')
# we only support float32
backend_test.exclude('uint8')
backend_test.exclude('uint16')
backend_test.exclude('uint32')
backend_test.exclude('uint64')
backend_test.exclude('int8')
backend_test.exclude('int16')
backend_test.exclude('float64')
backend_test.exclude('string')
backend_test.exclude('test_pow_types_int*')
backend_test.exclude('test_cast_*')
backend_test.exclude('test_castlike_*')
backend_test.exclude('test_convinteger_*')
backend_test.exclude('test_matmulinteger_*')
backend_test.exclude('test_reduce_log_sum_exp*') # dependent on actual float64 implementation for backends
backend_test.exclude('test_operator_add*') # dependent on float64 math. Without it values default to 0 or inf
# we don't support indexes
# backend_test.exclude('test_argmax_*') # Needs more work: select_last_index
# backend_test.exclude('test_argmin_*') # Needs more work: select_last_index
backend_test.exclude('test_nonzero_*')
# no support for mod
backend_test.exclude('test_mod_*')
# no boolean ops (2d, 3d, 4d)
backend_test.exclude('test_bitshift_*')
# no scatternd gathernd
backend_test.exclude('test_gathernd_*')
backend_test.exclude('test_scatternd_*')
# no quantize
backend_test.exclude('test_dynamicquantizelinear_*')
backend_test.exclude('test_qlinearmatmul_*')
backend_test.exclude('test_qlinearconv_*')
backend_test.exclude('test_quantizelinear_*')
# no rnn
backend_test.exclude('test_gru_*')
backend_test.exclude('test_rnn_*')
backend_test.exclude('test_lstm_*')
backend_test.exclude('test_simple_rnn_*')
# no control flow
backend_test.exclude('test_if_*')
backend_test.exclude('test_loop*')
backend_test.exclude('test_range_float_type_positive_delta_expanded_cpu') # requires loop
# unsupported (strange) ops
backend_test.exclude('test_bitwise_*')
backend_test.exclude('test_blackmanwindow_*')
backend_test.exclude('test_bernoulli_*')
backend_test.exclude('test_cumsum_*')
backend_test.exclude('test_det_*')
backend_test.exclude('test_tril_zero_cpu') # TODO: zero array support
backend_test.exclude('test_triu_zero_cpu') # TODO: zero array support
backend_test.exclude('test_col2im_*')
backend_test.exclude('test_hammingwindow_*')
backend_test.exclude('test_hannwindow_*')
backend_test.exclude('test_hardmax_*')
backend_test.exclude('test_gridsample_*')
backend_test.exclude('test_dft_*')
backend_test.exclude('test_einsum_*')
backend_test.exclude('test_strnorm_*')
backend_test.exclude('test_unique_*')
backend_test.exclude('test_sequence_*')
backend_test.exclude('test_nonmaxsuppression_*')
backend_test.exclude('test_reversesequence_*')
backend_test.exclude('test_roialign_*')
backend_test.exclude('test_top_k_*')
backend_test.exclude('test_tfidfvectorizer_*')
backend_test.exclude('test_stft_*')
backend_test.exclude('test_melweightmatrix_*')
# more strange ops
backend_test.exclude('test_basic_deform_conv_*')
backend_test.exclude('test_deform_conv_*')
backend_test.exclude('test_lppool_*')
backend_test.exclude('test_depthtospace_*')
backend_test.exclude('test_spacetodepth_*')
backend_test.exclude('test_scan*')
backend_test.exclude('test_split_to_sequence_*')
backend_test.exclude('test_resize_downsample_scales_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_downsample_sizes_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_scales_cubic_*') # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_sizes_cubic_*') # unsure how to implement cubic
# rest of the failing tests
backend_test.exclude('test_averagepool_2d_dilations_cpu') # dilations != 1 not supported for avgpool
backend_test.exclude('test_convtranspose_autopad_same_cpu') # TODO geohotstan has no idea how this is done, autopad requires output_shape but output_shape requires pads from autopad
backend_test.exclude('test_optional_has_element_empty_optional_input_cpu') # Attempts to create Tensor from None
backend_test.exclude('test_range_int32_type_negative_delta_expanded_cpu') # AttributeProto.GRAPH not implemented
backend_test.exclude('test_reshape_allowzero_reordered_cpu') # reshaping to 0 shape
backend_test.exclude('test_resize_downsample_scales_linear_antialias_cpu') # antialias not implemented
backend_test.exclude('test_resize_downsample_sizes_linear_antialias_cpu') # antialias not implemented
backend_test.exclude('test_resize_tf_crop_and_resize_cpu') # unsure about fill value after clip
backend_test.exclude('test_operator_addconstant_cpu') # bad data type
# issue 1556 https://github.com/tinygrad/tinygrad/issues/1556
backend_test.exclude('test_isinf_cpu')
backend_test.exclude('test_isinf_negative_cpu')
backend_test.exclude('test_isinf_positive_cpu')
backend_test.exclude('test_isnan_cpu')
# issue 1791 fast math messes with these https://github.com/tinygrad/tinygrad/issues/1791
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_2_3_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_3_2_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_cpu')
# issue 2067 potentially also a fastmath issue https://github.com/tinygrad/tinygrad/issues/2067
if getenv('METAL'):
  backend_test.exclude('test_maxpool_2d_pads_cpu')
  backend_test.exclude('test_maxpool_2d_same_lower_cpu')

# Don't know how to treat special TensorProto like TensorProto.FLOAT8E4M3FN
if getenv("CPU") or getenv("TORCH"):
  backend_test.exclude('test_dequantizelinear_axis_cpu')
  backend_test.exclude('test_dequantizelinear_cpu')

# compiled backends cannot reshape to and from 0
if getenv('LLVM') or getenv('GPU') or getenv('CLANG') or getenv('METAL') or getenv('CUDA'):
  backend_test.exclude('test_slice_start_out_of_bounds_cpu')
  backend_test.exclude('test_constantofshape_int_shape_zero_cpu')

if getenv('GPU') or getenv('METAL'):
  backend_test.exclude('test_mish_cpu')  # weird inaccuracy
  backend_test.exclude('test_mish_expanded_cpu')  # weird inaccuracy
  backend_test.exclude('test_eyelike_with_dtype_cpu')  # backend does not support dtype: Double

# Segfaults in CI
if (getenv('LLVM') or getenv('CUDA')) and CI:
  backend_test.exclude('test_max_float16_cpu')
  backend_test.exclude('test_min_float16_cpu')

# disable model tests for now since they are slow
if not getenv("MODELTESTS"):
  for x in backend_test.test_suite:
    if 'OnnxBackendRealModelTest' in str(type(x)):
      backend_test.exclude(str(x).split(" ")[0])
else:
  # model tests all pass!
  backend_test.include('test_resnet50')
  backend_test.include('test_inception_v1')
  backend_test.include('test_inception_v2')
  backend_test.include('test_densenet121')
  backend_test.include('test_shufflenet')
  backend_test.include('test_squeezenet')
  backend_test.include('test_bvlc_alexnet')
  backend_test.include('test_zfnet512')
  backend_test.include('test_vgg19')

globals().update(backend_test.enable_report().test_cases)

if __name__ == '__main__':
  unittest.main()

View File

@@ -0,0 +1,392 @@
#!/usr/bin/env python
import os
import torch
if "OPT" not in os.environ:
  os.environ["OPT"] = "2"

import gc
import numpy as np
import unittest
from tinygrad.tensor import Tensor, Device
from tinygrad import nn
from tinygrad.helpers import getenv
from tinygrad.nn import optim
from tinygrad.helpers import GlobalCounters
from tinygrad.lazy import PUSH_PERMUTES
from tinygrad.jit import CacheCollector

class CLCache:
  def __init__(self, allowed=None, strict=False, preclear=True): self.allowed, self.strict, self.preclear = allowed, strict, preclear
  def __enter__(self):
    if self.preclear:
      gc.collect()
      for x in [x for x in gc.get_objects() if isinstance(x, Tensor)]:
        x.realize()
      GlobalCounters.reset()
    CacheCollector.start()
    print("cache: entering")
  def __exit__(self, type, value, traceback):
    cache = CacheCollector.finish()
    print(f"cache: exiting with size {len(cache)}", f"allowed {self.allowed}" if self.allowed is not None else "")
    if self.allowed is not None:
      assert len(cache) <= self.allowed and (not self.strict or len(cache) == self.allowed), f"used too many kernels! {len(cache)} > {self.allowed}"

from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.resnet import ResNet18
from models.vit import ViT
from tinygrad.nn.state import get_parameters

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestInferenceMinKernels(unittest.TestCase):
  def setUp(self):
    Tensor.training = False

  @unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
  def test_convnext(self):
    model = ConvNeXt()
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(129):
      model(img).realize()

  def test_enet(self):
    model = EfficientNet(getenv("ENET_NUM", 0), has_se=False)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(51):
      model.forward(img).realize()

  def test_enet_se(self):
    model = EfficientNet(getenv("ENET_NUM", 0), has_se=True)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    # TODO: this seems very high
    with CLCache(115):
      model.forward(img).realize()

  def test_resnet(self):
    model = ResNet18()
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(26):
      model.forward(img).realize()

  def test_vit(self):
    model = ViT(embed_dim=192, num_heads=3)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(222):  # NOTE: this is way too high
      out = model.forward(img)
      assert len(CacheCollector.cache) == 0, "ViT prerealized?"
      out.realize()

  def test_llama(self):
    from examples.llama import Transformer
    args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    with CLCache(85):
      model(Tensor([[1,2,3,4]]), 0).realize()

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptBinOp(unittest.TestCase):
  def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      c = f1(a, b)
      if f2 is not None: d = f2(a, b)
      c.realize()
      if f2 is not None: d.realize()
      assert len(CacheCollector.cache) == allowed, "binop was rerun!"
    if f2 is not None: np.testing.assert_allclose(c.numpy().ravel(), d.numpy().ravel(), rtol=1e-3, atol=1e-5)

  def test_no_binop_rerun(self): return self._test_no_binop_rerun(lambda a,b: a*b, lambda a,b: (a*b).reshape(16, 16, 1))
  def test_no_binop_rerun_alt(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(16, 16, 1), lambda a,b: a*b)
  def test_no_binop_rerun_reduce_broadcast(self): return self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=2)

  @unittest.skip("this test started failing with the new change, based movementop issue")
  def test_no_binop_rerun_transposed(self): return self._test_no_binop_rerun(lambda a,b: (a.T*b.T).T, lambda a,b: a*b)

  def test_no_binop_rerun_mid_reshape(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(256)+a.reshape(256))

  # currently non working tests
  #def test_no_binop_rerun_preshape(self): return self._test_no_binop_rerun(lambda a,b: a.reshape(16, 16, 1)*b.reshape(16, 16, 1), lambda a,b: a*b)
  #def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
  #def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptReduceLoop(unittest.TestCase):
  @unittest.skip("this is broken")
  def test_loop_left(self):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      t = a.sum(0)
      b = t.reshape(16,1).expand(16,16).sum(0)
      c = (t+b)
      c.realize()
      assert len(CacheCollector.cache) == 2, "loop left fusion broken"

  def test_loop_right(self):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      t = a.sum(0)
      b = t.reshape(16,1).expand(16,16).sum(0)
      c = (b+t)
      c.realize()
      assert len(CacheCollector.cache) == 2, "loop right fusion broken"

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptWChild(unittest.TestCase):
  def test_unrealized_child(self):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      c = (a*b).sum()
      d = c+1
      e = c+2
      d.realize()
      assert len(CacheCollector.cache) == 2, "don't fuse if you have children"

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOpt(unittest.TestCase):
  def test_muladd(self):
    a,b,c = [Tensor.ones(2,2) for _ in range(3)]
    with CLCache():
      d = a * b + c
      d.realize()
      assert len(CacheCollector.cache) == 1, "optimizer didn't fold muladd"
    np.testing.assert_allclose(d.numpy(), np.ones((2,2))*2, rtol=1e-5)

  def test_fold_reduce_elementwise(self):
    img = Tensor.ones(32)
    addme = Tensor.ones(1)
    with CLCache():
      ret = img.sum() + addme
      ret.realize()
      assert len(CacheCollector.cache) == 1, "optimizer didn't fold reduce/elementwise"
    assert ret.numpy()[0] == 33

  def test_fold_batchnorm(self):
    with Tensor.train():
      img = Tensor.ones(1,32,4,4)
      bn = nn.BatchNorm2d(32, track_running_stats=False)
      with CLCache():
        img_bn = bn(img).realize()
        print(img_bn)
        assert len(CacheCollector.cache) == 3, f"optimizer didn't fold batchnorm, got {len(CacheCollector.cache)}"
      # Tensor.training = False

  def test_fold_conv_sgd(self):
    with Tensor.train():
      img = Tensor.ones(2,3,4,4)
      c1 = nn.Conv2d(3,32,3)
      opt = optim.SGD(get_parameters(c1))
      with CLCache():
        opt.zero_grad()
        c1(img).relu().sum().backward()
        opt.step()
        # TODO: this should be 4, but the sum output child stays around
        # with pushing_permutes it can be 3
        # TODO: broken with optim fixes
        assert len(CacheCollector.cache) in [4,5,6], f"optimizer didn't fold conv-backward SGD, got {len(CacheCollector.cache)}"
      # Tensor.training = False

  def test_fold_2convs_sgd(self):
    with Tensor.train():
      img = Tensor.ones(2,3,64,64)
      c1 = nn.Conv2d(3,16,3,bias=False)
      c2 = nn.Conv2d(16,32,3,bias=False)
      opt = optim.SGD(get_parameters([c1, c2]))
      with CLCache(allowed=9):
        opt.zero_grad()
        c2(c1(img).relu()).relu().sum().backward()
        opt.step()
      # Tensor.training = False

  def test_fold_4convs_sgd(self):
    with Tensor.train():
      img = Tensor.ones(2,3,64,64)
      c1 = nn.Conv2d(3,4,3,bias=False)
      c2 = nn.Conv2d(4,8,3,bias=False)
      c3 = nn.Conv2d(8,16,3,bias=False)
      c4 = nn.Conv2d(16,32,3,bias=False)
      opt = optim.SGD(get_parameters([c1, c2, c3, c4]))
      with CLCache(allowed=19):
        opt.zero_grad()
        c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
        opt.step()
      # Tensor.training = False

  def test_fold_conv_batchnorm_sgd(self):
    with Tensor.train():
      img = Tensor.ones(1,3,4,4)
      c1 = nn.Conv2d(3,32,3)
      bn = nn.BatchNorm2d(32, track_running_stats=False)
      opt = optim.SGD(get_parameters([c1, bn]))
      with CLCache(allowed=18):  # this is too high
img_bn = bn(c1(img)).elu().sum()
opt.zero_grad()
img_bn.backward()
opt.step()
# Tensor.training = False
def test_fold_conv_batchnorm_notrain(self):
img = Tensor.ones(1,3,8,8)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
# precache the bn
img_conv = bn(c1(img)).relu().realize()
with CLCache():
img_conv = bn(c1(img)).relu().realize()
assert len(CacheCollector.cache) == 1, f"optimizer didn't fold conv-batchnorm at test time, got {len(CacheCollector.cache)}"
def test_fold_conv_batchnorm(self):
with Tensor.train():
img = Tensor.ones(1,3,8,8)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
with CLCache():
img_conv = bn(c1(img)).relu().realize()
print(img_conv)
assert len(CacheCollector.cache) == 4, f"optimizer didn't fold conv-batchnorm, got {len(CacheCollector.cache)}"
def test_fold_conv_elu(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3)
c2 = nn.Conv2d(4, 4, kernel_size=3)
with CLCache():
img_conv = img.sequential([c1, Tensor.elu, c2, Tensor.elu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/elu"
def test_fold_conv_relu(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3)
c2 = nn.Conv2d(4, 4, kernel_size=3)
with CLCache():
img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"
def test_fold_conv_relu_nobias(self):
img = Tensor.ones(1,4,8,8)
c1 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
c2 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
with CLCache():
img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
print(img_conv)
assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"
def test_permute_was_pushed(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = c.permute(1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
def test_permute_was_pushed_through_contract_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache():
c = a.sum(-1)
d = c.reshape(16,16).permute(1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,16).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
def test_permute_was_pushed_through_contractw1s_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache():
c = a.sum(-1)
d = c.reshape(16,1,16).permute(2,1,0).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,1,16).transpose(2,1,0), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
# TODO: push permute through expansion reshape
@unittest.skip("expansion can't push expand permute yet")
@unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
def test_permute_was_pushed_through_expand_reshape(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0).reshape(4,4,4,4), d.numpy(), rtol=1e-3, atol=1e-5)
if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"
@unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
def test_no_reduceop_rerun(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2)
d = a.sum(2).permute(1,0)
c.realize()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(c.numpy().transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
assert cache_len == 1, "reduceop was rerun!"
@unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
def test_no_reduceop_rerun_alt(self):
a = Tensor.randn(16, 16, 16)
with CLCache():
c = a.sum(2).permute(1,0)
d = a.sum(2)
c.realize()
d.realize()
cache_len = len(CacheCollector.cache)
np.testing.assert_allclose(c.numpy(), d.numpy().transpose(1,0), rtol=1e-3, atol=1e-5)
assert cache_len == 1, "reduceop was rerun!"
def test_fold_with_contiguous(self):
a = Tensor.randn(16, 16, 16)
b = Tensor.randn(16, 16)
with CLCache():
c = (a.sum(2).contiguous() + b).contiguous()
c.realize()
cache_len = len(CacheCollector.cache)
assert cache_len == 1, "contiguous wasn't folded"
  def _test_fold_expand_reduce_helper(self, n, m, axis, allowed):
    b = torch.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
    with CLCache(allowed=allowed):
      a = Tensor.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
      a.realize()
    np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
  def test_expand_reduce_is_folded_on_same_axis(self):
    for axis in [0, 1]:
      for n in [4, 8, 16]:
        self._test_fold_expand_reduce_helper(n, n, axis, allowed=2)
  def test_expand_reduce_is_not_folded_on_different_axes(self):
    axis1, axis2 = 0, 1
    for n in [4, 8, 16]:
      b = torch.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
      with CLCache(allowed=3):
        a = Tensor.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
        a.realize()
      np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python
import unittest
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import LAMB
np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
W_init = np.random.randn(4,4).astype(np.float32)
m_init = np.random.randn(1,4).astype(np.float32)
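# a tiny two-layer net is trained with both tinygrad's LAMB and tensorflow-addons' LAMB;
# the resulting x and W must match within tolerance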
class TinyNet:
def __init__(self):
self.x = Tensor(x_init.copy(), requires_grad=True)
self.W = Tensor(W_init.copy(), requires_grad=True)
self.m = Tensor(m_init.copy())
def forward(self):
out = self.x.matmul(self.W).relu()
out = out.log_softmax(1)
out = out.mul(self.m).add(self.m).sum()
return out
class TinyNetTF:
def __init__(self):
self.x = tf.Variable(x_init.copy(), trainable=True)
self.W = tf.Variable(W_init.copy(), trainable=True)
self.m = tf.constant(m_init.copy())
def forward(self):
out = tf.matmul(self.x, self.W)
out = tf.nn.relu(out)
out = tf.nn.log_softmax(out, axis=1)
out = tf.multiply(out, self.m) + self.m
out = tf.reduce_sum(out)
return out
def step(optim, steps=1, kwargs={}):
net = TinyNet()
optim = optim([net.x, net.W], **kwargs)
for _ in range(steps):
out = net.forward()
optim.zero_grad()
out.backward()
optim.step()
return net.x.detach().numpy(), net.W.detach().numpy()
def step_tf(optim, steps=1, kwargs={}):
net = TinyNetTF()
optim = optim(**kwargs)
for _ in range(steps):
with tf.GradientTape() as tape:
out = net.forward()
grads = tape.gradient(out, [net.x, net.W])
optim.apply_gradients(zip(grads, [net.x, net.W]))
return net.x.numpy(), net.W.numpy()
class ExternalTestOptim(unittest.TestCase):
def _test_optim(self, tinygrad_optim, tensorflow_optim, steps, opts, atol, rtol):
for x,y in zip(step(tinygrad_optim, steps, kwargs=opts),
step_tf(tensorflow_optim, steps, kwargs=opts)):
np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)
def _test_lamb(self, steps, opts, atol, rtol): self._test_optim(LAMB, tfa.optimizers.LAMB, steps, opts, atol, rtol)
def test_lamb(self): self._test_lamb(1, {'lr': 0.001}, 1e-5, 0)
def test_lamb_high_lr(self): self._test_lamb(1, {'lr': 10}, 1e-5, 1e-5)
def test_multistep_lamb(self): self._test_lamb(10, {'lr': 0.001}, 1e-5, 0)
def test_multistep_lamb_high_lr(self): self._test_lamb(10, {'lr': 10}, 1e-5, 3e-4)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,57 @@
# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
import unittest, time
import numpy as np
from examples.llama import Transformer, MODEL_PARAMS
from test.test_net_speed import start_profile, stop_profile
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.ops import Compiled
from tinygrad.helpers import dtypes, prod
from tinygrad.runtime.lib import RawBuffer
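# FakeProgram and RawFakeBuffer stub out the compiled backend's program and buffer types,
# so kernels are generated (and cached) but never actually executed on a device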
class FakeProgram:
def __init__(self, name:str, prg:str): pass
def __call__(self, *bufs, global_size, local_size, wait=False): pass
class RawFakeBuffer(RawBuffer):
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
class TestLLaMASpeed(unittest.TestCase):
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
def test_llama_compile(self):
backup_program = Device[Device.DEFAULT].runtime
backup_buffer = Device[Device.DEFAULT].buffer
Device[Device.DEFAULT].runtime = FakeProgram
Device[Device.DEFAULT].buffer = RawFakeBuffer
print("testing llama python run time")
model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
print("built model")
# assign fake tensors to the values
for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
print("assigned empty tensors, doing warmup")
def run_llama(st, empty_method_cache=True):
if empty_method_cache: Device[Device.DEFAULT].method_cache.clear()
tms = [time.perf_counter()]
for i in range(10):
model(Tensor([[2]]), i).realize()
tms.append(time.perf_counter())
timings = [(tms[i+1]-tms[i])*1000 for i in range(len(tms)-1)]
print(f"{st:15s} mean runtime: {sum(timings)/len(timings):7.2f}ms, runs: ", ", ".join(f'{x:7.2f}' for x in timings))
run_llama("codegen")
run_llama("methodcache", False)
pr = start_profile()
run_llama("profile")
stop_profile(pr, sort='time', frac=0.1)
Device[Device.DEFAULT].runtime = backup_program
Device[Device.DEFAULT].buffer = backup_buffer
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python
import unittest
from tinygrad.tensor import Tensor
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.renderer.opencl import OpenCLRenderer
from tinygrad.graph import graph_uops
from tinygrad.nn import Conv2d
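# each test lowers a scheduled AST through the Linearizer with hand-coded optimizations,
# dumps the resulting uop graph, and renders OpenCL source from it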
class TestUopsGraph(unittest.TestCase):
def test_matmul(self):
N = 1024
a = Tensor.rand(N,N)
b = Tensor.rand(N,N)
si = (a@b).lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
print(lin.colored_shape())
uops = lin.linearize().uops
graph_uops(uops)
for u in uops: print(u)
print(OpenCLRenderer("matmul", uops)[0])
def test_reduce(self):
a = Tensor.rand(1024*1024)
si = a.sum().lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
uops = lin.linearize().uops
graph_uops(uops)
#print(OpenCLRenderer("reduce", uops)[0])
def test_conv(self):
x = Tensor.rand(1,3,16,16)
c = Conv2d(3, 16, (3,3))
si = c(x).elu().lazydata.schedule()[-1]
lin = Linearizer(si.ast)
lin.hand_coded_optimizations()
uops = lin.linearize().uops
graph_uops(uops)
print(lin.colored_shape())
print(OpenCLRenderer("conv", uops)[0])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,36 @@
import io
import unittest
from pathlib import Path
import cv2
import requests # type: ignore
import numpy as np
from tinygrad.tensor import Tensor
from examples.yolov3 import Darknet, infer, show_labels
from extra.utils import fetch
chicken_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/Chicken.jpg'))
car_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/car.jpg'))
class TestYOLO(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = Darknet(fetch("https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"))
print("Loading weights file (237MB). This might take a while…")
cls.model.load_weights("https://pjreddie.com/media/files/yolov3.weights")
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
labels = show_labels(infer(self.model, chicken_img), confidence=0.56)
self.assertEqual(labels, ["bird"])
def test_car(self):
labels = show_labels(infer(self.model, car_img))
self.assertEqual(labels, ["car"])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,76 @@
import numpy as np
from extra.utils import fetch, download_file, get_child
from examples.yolov8 import YOLOv8, get_variant_multiples, preprocess, postprocess, label_predictions
from pathlib import Path
import unittest
import io, cv2, os
import onnxruntime as ort
import ultralytics
from tinygrad.nn.state import safe_load, load_state_dict
class TestYOLOv8(unittest.TestCase):
def test_all_load_weights(self):
for variant in ['n', 's', 'm', 'l', 'x']:
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
download_file(f'https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{variant}.safetensors', weights_location)
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
print(f'successfully loaded weights for yolov{variant}')
def test_predictions(self):
test_image_urls = ['https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg', 'https://www.aljazeera.com/wp-content/uploads/2022/10/2022-04-28T192650Z_1186456067_UP1EI4S1I0P14_RTRMADP_3_SOCCER-ENGLAND-MUN-CHE-REPORT.jpg']
variant = 'n'
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
for i in range(len(test_image_urls)):
img_stream = io.BytesIO(fetch(test_image_urls[i]))
img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1)
test_image = preprocess([img])
predictions = TinyYolov8(test_image)
post_predictions = postprocess(preds=predictions, img=test_image, orig_imgs=[img])
labels = label_predictions(post_predictions)
assert labels == {5: 1, 0: 4, 11: 1} if i == 0 else labels == {0: 13, 29: 1, 32: 1}
def test_forward_pass_torch_onnx(self):
variant = 'n'
weights_location_onnx = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.onnx'
weights_location_pt = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.pt'
weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
download_file(f'https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8{variant}.pt', weights_location_pt)
    # the ultralytics export prints a lot of unnecessary things
if not weights_location_onnx.is_file():
model = ultralytics.YOLO(model=weights_location_pt, task='Detect')
model.export(format="onnx",imgsz=[640, 480])
depth, width, ratio = get_variant_multiples(variant)
TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
state_dict = safe_load(weights_location)
load_state_dict(TinyYolov8, state_dict)
image_location = [np.frombuffer(io.BytesIO(fetch('https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg')).read(), np.uint8)]
orig_image = [cv2.imdecode(image_location[0], 1)]
input_image = preprocess(orig_image)
onnx_session = ort.InferenceSession(weights_location_onnx)
onnx_input_name = onnx_session.get_inputs()[0].name
onnx_output_name = onnx_session.get_outputs()[0].name
onnx_output = onnx_session.run([onnx_output_name], {onnx_input_name: input_image.numpy()})
tiny_output = TinyYolov8(input_image)
    # rtol is currently 0.025 because our predictions differ from torch's by 1-2%:
    # the maxpool layers in the SPPF module (line 280) pad with zeros, while torch pads with -infinity.
    # the difference is not noticeable visually.
np.testing.assert_allclose(onnx_output[0], tiny_output.numpy(), atol=5e-4, rtol=0.025)
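    # minimal illustration of the mismatch (not part of the test): with zero padding, a maxpool
    # window hanging over the border of an all-negative feature map returns 0, while torch's
    # -infinity padding returns the window's true (negative) maximum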
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,61 @@
import random
from tinygrad.helpers import DEBUG
from test.unit.test_shapetracker import CheckingShapeTracker
random.seed(42)
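# fuzz the ShapeTracker: apply random movement ops and let CheckingShapeTracker verify the
# symbolic view still matches the concretely tracked data (st.assert_same())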
def do_permute(st):
perm = list(range(0, len(st.shape)))
random.shuffle(perm)
perm = tuple(perm)
if DEBUG >= 1: print("st.permute(", perm, ")")
st.permute(perm)
def do_pad(st):
c = random.randint(0, len(st.shape)-1)
pad = tuple((random.randint(0,2), random.randint(0,2)) if i==c else (0,0) for i in range(len(st.shape)))
if DEBUG >= 1: print("st.pad(", pad, ")")
st.pad(pad)
def do_reshape_split_one(st):
c = random.randint(0, len(st.shape)-1)
poss = [n for n in [1,2,3,4,5] if st.shape[c]%n == 0]
spl = random.choice(poss)
shp = st.shape[0:c] + (st.shape[c]//spl, spl) + st.shape[c+1:]
if DEBUG >= 1: print("st.reshape(", shp, ")")
st.reshape(shp)
def do_reshape_combine_two(st):
if len(st.shape) < 2: return
c = random.randint(0, len(st.shape)-2)
shp = st.shape[:c] + (st.shape[c] * st.shape[c+1], ) + st.shape[c+2:]
if DEBUG >= 1: print("st.reshape(", shp, ")")
st.reshape(shp)
def do_shrink(st):
c = random.randint(0, len(st.shape)-1)
while 1:
shrink = tuple((random.randint(0,s), random.randint(0,s)) if i == c else (0,s) for i,s in enumerate(st.shape))
if all(x<y for (x,y) in shrink): break
if DEBUG >= 1: print("st.shrink(", shrink, ")")
st.shrink(shrink)
def do_stride(st):
c = random.randint(0, len(st.shape)-1)
stride = tuple(random.choice([-2,-1,2]) if i==c else 1 for i in range(len(st.shape)))
if DEBUG >= 1: print("st.stride(", stride, ")")
st.stride(stride)
def do_expand(st):
c = [i for i,s in enumerate(st.shape) if s==1]
if len(c) == 0: return
c = random.choice(c)
expand = tuple(random.choice([2,3,4]) if i==c else s for i,s in enumerate(st.shape))
if DEBUG >= 1: print("st.expand(", expand, ")")
st.expand(expand)
if __name__ == "__main__":
ops = [do_permute, do_pad, do_shrink, do_reshape_split_one, do_reshape_combine_two, do_stride, do_expand]
for _ in range(200):
st = CheckingShapeTracker((random.randint(2, 10), random.randint(2, 10), random.randint(2, 10)))
for i in range(8): random.choice(ops)(st)
st.assert_same()

View File

@@ -0,0 +1,69 @@
import itertools
import random
from tinygrad.helpers import DEBUG
from tinygrad.shape.symbolic import Variable
random.seed(42)
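# build a random tape of symbolic ops over three Variables, then check that evaluating the
# rendered expression matches replaying the same tape on concrete integer samples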
def add_v(expr, rng=None):
if rng is None: rng = random.randint(0,2)
return expr + v[rng], rng
def div(expr, rng=None):
if rng is None: rng = random.randint(1,9)
return expr // rng, rng
def mul(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr * rng, rng
def mod(expr, rng=None):
if rng is None: rng = random.randint(1,9)
return expr % rng, rng
def add_num(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr + rng, rng
def lt(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr < rng, rng
def ge(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr >= rng, rng
def le(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr <= rng, rng
def gt(expr, rng=None):
if rng is None: rng = random.randint(-4,4)
return expr > rng, rng
if __name__ == "__main__":
ops = [add_v, div, mul, add_num, mod]
for _ in range(1000):
upper_bounds = [*list(range(1, 10)), 16, 32, 64, 128, 256]
u1 = Variable("v1", 0, random.choice(upper_bounds))
u2 = Variable("v2", 0, random.choice(upper_bounds))
u3 = Variable("v3", 0, random.choice(upper_bounds))
v = [u1,u2,u3]
tape = [random.choice(ops) for _ in range(random.randint(2, 30))]
# 10% of the time, add one of lt, le, gt, ge
if random.random() < 0.1: tape.append(random.choice([lt, le, gt, ge]))
expr = Variable.num(0)
rngs = []
for t in tape:
expr, rng = t(expr)
if DEBUG >= 1: print(t.__name__, rng)
rngs.append(rng)
if DEBUG >=1: print(expr)
space = list(itertools.product(range(u1.min, u1.max+1), range(u2.min, u2.max+1), range(u3.min, u3.max+1)))
volume = len(space)
for (v1, v2, v3) in random.sample(space, min(100, volume)):
v = [v1,v2,v3]
rn = 0
for t,r in zip(tape, rngs): rn, _ = t(rn, r)
num = eval(expr.render())
assert num == rn, f"mismatched {expr.render()} at {v1=} {v2=} {v3=} = {num} != {rn}"
if DEBUG >= 1: print(f"matched {expr.render()} at {v1=} {v2=} {v3=} = {num} == {rn}")

View File

@@ -0,0 +1,61 @@
import unittest
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d, BatchNorm2d, optim
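# smoke tests: each model must survive a single SGD training step (forward, backward, step)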
def model_step(lm):
with Tensor.train():
x = Tensor.ones(8,12,128,256, requires_grad=False)
optimizer = optim.SGD(get_parameters(lm), lr=0.001)
loss = lm.forward(x).sum()
optimizer.zero_grad()
loss.backward()
del x,loss
optimizer.step()
class TestBatchnorm(unittest.TestCase):
def test_conv(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
def forward(self, x):
return self.c(x).relu()
lm = LilModel()
model_step(lm)
def test_two_conv(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
self.c2 = Conv2d(32, 32, 3, padding=1, bias=False)
def forward(self, x):
return self.c2(self.c(x)).relu()
lm = LilModel()
model_step(lm)
def test_two_conv_bn(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 24, 3, padding=1, bias=False)
self.bn = BatchNorm2d(24, track_running_stats=False)
self.c2 = Conv2d(24, 32, 3, padding=1, bias=False)
self.bn2 = BatchNorm2d(32, track_running_stats=False)
def forward(self, x):
x = self.bn(self.c(x)).relu()
return self.bn2(self.c2(x)).relu()
lm = LilModel()
model_step(lm)
def test_conv_bn(self):
class LilModel:
def __init__(self):
self.c = Conv2d(12, 32, 3, padding=1, bias=False)
self.bn = BatchNorm2d(32, track_running_stats=False)
def forward(self, x):
return self.bn(self.c(x)).relu()
lm = LilModel()
model_step(lm)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,74 @@
import unittest
import numpy as np
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, CI
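# multidevice_test reruns the wrapped test on every registered backend as a subTest,
# skipping DISK/SHM/FAKE, anything in EXCLUDE_DEVICES, and devices that fail to initialize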
def multidevice_test(fxn):
exclude_devices = getenv("EXCLUDE_DEVICES", "").split(",")
def ret(self):
for device in Device._buffers:
if device in ["DISK", "SHM", "FAKE"]: continue
if not CI: print(device)
if device in exclude_devices:
if not CI: print(f"WARNING: {device} test is excluded")
continue
with self.subTest(device=device):
try:
Device[device]
except Exception:
if not CI: print(f"WARNING: {device} test isn't running")
continue
fxn(self, device)
return ret
class TestExample(unittest.TestCase):
@multidevice_test
def test_convert_to_cpu(self, device):
a = Tensor([[1,2],[3,4]], device=device)
assert a.numpy().shape == (2,2)
b = a.cpu()
assert b.numpy().shape == (2,2)
@multidevice_test
def test_2_plus_3(self, device):
a = Tensor([2], device=device)
b = Tensor([3], device=device)
result = a + b
print(f"{a.numpy()} + {b.numpy()} = {result.numpy()}")
assert result.numpy()[0] == 5.
@multidevice_test
def test_example_readme(self, device):
x = Tensor.eye(3, device=device, requires_grad=True)
y = Tensor([[2.0,0,-2.0]], device=device, requires_grad=True)
z = y.matmul(x).sum()
z.backward()
x.grad.numpy() # dz/dx
y.grad.numpy() # dz/dy
assert x.grad.device == device
assert y.grad.device == device
@multidevice_test
def test_example_matmul(self, device):
try:
Device[device]
except Exception:
print(f"WARNING: {device} test isn't running")
return
x = Tensor.eye(64, device=device, requires_grad=True)
y = Tensor.eye(64, device=device, requires_grad=True)
z = y.matmul(x).sum()
z.backward()
x.grad.numpy() # dz/dx
y.grad.numpy() # dz/dy
assert x.grad.device == device
assert y.grad.device == device
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,50 @@
import unittest
from extra.export_model import export_model, EXPORT_SUPPORTED_DEVICE
from tinygrad.tensor import Tensor, Device
import json
class MockMultiInputModel:
def forward(self, x1, x2, x3):
return x1 + x2 + x3
class MockMultiOutputModel:
def __call__(self, x1):
return x1 + 2.0, x1.pad(((0, 0), (0, 1))) + 1.0
# TODO: move compile_efficientnet tests here
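# export_model returns the exported program plus maps of input and output sizes (the fourth
# return value is unused here); for these tests the program parses as JSON with
# buffers/inputs/outputs metadata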
@unittest.skipUnless(Device.DEFAULT in EXPORT_SUPPORTED_DEVICE, f"Model export is not supported on {Device.DEFAULT}")
class TestModelExport(unittest.TestCase):
def test_multi_input_model_export(self):
model = MockMultiInputModel()
inputs = [Tensor.rand(2,2), Tensor.rand(2,2), Tensor.rand(2,2)]
prg, inp_sizes, _, _ = export_model(model, "", *inputs)
prg = json.loads(prg)
assert len(inputs) == len(prg["inputs"]) == len(inp_sizes), f"Model and exported inputs don't match: mdl={len(inputs)}, prg={len(prg['inputs'])}, inp_sizes={len(inp_sizes)}"
for i in range(len(inputs)):
assert f"input{i}" in inp_sizes, f"input{i} not captured in inp_sizes"
assert f"input{i}" in prg["buffers"], f"input{i} not captured in exported buffers"
for i, exported_input in enumerate(prg["inputs"]):
assert inputs[i].dtype.name == exported_input["dtype"], f"Model and exported input dtype don't match: mdl={inputs[i].dtype.name}, prg={exported_input['dtype']}"
def test_multi_output_model_export(self):
model = MockMultiOutputModel()
    inp = Tensor.rand(2,2)
    outputs = model(inp)
    prg, _, out_sizes, _ = export_model(model, "", inp)
prg = json.loads(prg)
assert len(outputs) == len(prg["outputs"]) == len(out_sizes), f"Model and exported outputs don't match: mdl={len(outputs)}, prg={len(prg['outputs'])}, inp_sizes={len(out_sizes)}"
for i in range(len(outputs)):
assert f"output{i}" in out_sizes, f"output{i} not captured in out_sizes"
assert f"output{i}" in prg["buffers"], f"output{i} not captured in exported buffers"
for i, exported_output in enumerate(prg["outputs"]):
assert outputs[i].dtype.name == exported_output["dtype"], f"Model and exported output dtype don't match: mdl={outputs[i].dtype.name}, prg={exported_output['dtype']}"
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python
import os, cloudpickle, tempfile, unittest, subprocess
from extra.helpers import enable_early_exec, cross_process, _CloudpickleFunctionWrapper
def normalize_line_endings(s): return s.replace(b'\r\n', b'\n')
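# enable_early_exec returns a callable that runs (argv, stdin) command pairs in a helper
# process; cross_process streams a generator's values back from a child process, using
# _CloudpickleFunctionWrapper to make the function picklable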
class TestEarlyExec(unittest.TestCase):
def setUp(self) -> None:
self.early_exec = enable_early_exec()
def early_exec_py_file(self, file_content, exec_args):
with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp:
temp.write(file_content)
temp_path = temp.name
try:
output = self.early_exec((["python3", temp_path] + exec_args, None))
return output
finally:
os.remove(temp_path)
def test_enable_early_exec(self):
output = self.early_exec_py_file(b'print("Hello, world!")', [])
self.assertEqual(b"Hello, world!\n", normalize_line_endings(output))
def test_enable_early_exec_with_arg(self):
output = self.early_exec_py_file(b'import sys\nprint("Hello, " + sys.argv[1] + "!")', ["world"])
self.assertEqual(b"Hello, world!\n", normalize_line_endings(output))
def test_enable_early_exec_process_exception(self):
with self.assertRaises(subprocess.CalledProcessError):
self.early_exec_py_file(b'raise Exception("Test exception")', [])
def test_enable_early_exec_type_exception(self):
with self.assertRaises(TypeError):
self.early_exec((["python3"], "print('Hello, world!')"))
class TestCrossProcess(unittest.TestCase):
def test_cross_process(self):
def _iterate():
for i in range(10): yield i
results = list(cross_process(_iterate))
self.assertEqual(list(range(10)), results)
def test_cross_process_exception(self):
def _iterate():
for i in range(10):
if i == 5: raise ValueError("Test exception")
yield i
with self.assertRaises(ValueError): list(cross_process(_iterate))
def test_CloudpickleFunctionWrapper(self):
def add(x, y): return x + y
self.assertEqual(7, cloudpickle.loads(cloudpickle.dumps(_CloudpickleFunctionWrapper(add)))(3, 4))
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,107 @@
import numpy as np
import torch
import unittest
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_parameters
from tinygrad.nn.optim import Adam
from extra.lr_scheduler import MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR, OneCycleLR
from extra.training import train, evaluate
from extra.datasets import fetch_mnist
import pytest
pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu]
np.random.seed(1337)
Tensor.manual_seed(1337)
X_train, Y_train, X_test, Y_test = fetch_mnist()
class TinyBobNet:
def __init__(self):
self.l1 = Tensor.scaled_uniform(784, 128)
self.l2 = Tensor.scaled_uniform(128, 10)
def parameters(self):
return get_parameters(self)
def forward(self, x):
return x.dot(self.l1).relu().dot(self.l2).log_softmax()
def lr_scheduler_training(sched_fn=None, args=None):
model = TinyBobNet()
optim = Adam(model.parameters(), lr=0.01)
if sched_fn is not None: sched = sched_fn(optim, **args)
for _ in range(25):
train(model, X_train, Y_train, optim, 100)
if sched_fn is not None:
if isinstance(sched, ReduceLROnPlateau):
sched.step(evaluate(model, X_test, Y_test))
else:
sched.step()
return evaluate(model, X_test, Y_test)
def current_lr(optim): return optim.param_groups[0]['lr'] if hasattr(optim, 'param_groups') else optim.lr
def get_lrs(optim, sched, epochs, steps=1, accs=None):
lr = current_lr(optim)
if not isinstance(lr, float): lr = lr.numpy()[0]
lrs = [lr]
for e in range(epochs):
for _ in range(steps):
optim.step()
sched.step() if accs is None else sched.step(accs[e])
lr = current_lr(optim)
if not isinstance(lr, float): lr = lr.numpy()[0]
lrs.append(lr)
return lrs
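# get_lrs records the learning rate after every scheduler step; the traces from the tinygrad
# and torch schedulers are then compared elementwise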
class TestLrScheduler(unittest.TestCase):
def _test_lr_scheduler(self, tinygrad_sched, torch_sched, epochs, opts, atol, rtol):
accs = opts.pop('accs', None)
tinygrad_optim, torch_optim = Adam([], lr=0.01), torch.optim.Adam([torch.tensor([0.], requires_grad=True)], lr=0.01)
tinygrad_sched, torch_sched = tinygrad_sched(tinygrad_optim, **opts), torch_sched(torch_optim, **opts)
tinygrad_lrs = get_lrs(tinygrad_optim, tinygrad_sched, epochs, accs=accs)
torch_lrs = get_lrs(torch_optim, torch_sched, epochs, accs=accs)
np.testing.assert_allclose(tinygrad_lrs, torch_lrs, atol=atol, rtol=rtol)
def _test_multisteplr(self, epochs, opts, atol, rtol):
self._test_lr_scheduler(MultiStepLR, torch.optim.lr_scheduler.MultiStepLR, epochs, opts, atol, rtol)
def _test_reducelronplateau(self, epochs, opts, atol, rtol):
opts['accs'] = np.random.randn(epochs)
self._test_lr_scheduler(ReduceLROnPlateau, torch.optim.lr_scheduler.ReduceLROnPlateau, epochs, opts, atol, rtol)
def _test_cosineannealinglr(self, epochs, opts, atol, rtol):
opts['T_max'] = epochs
self._test_lr_scheduler(CosineAnnealingLR, torch.optim.lr_scheduler.CosineAnnealingLR, epochs, opts, atol, rtol)
def _test_onecyclelr(self, epochs, opts, atol, rtol):
opts['total_steps'] = epochs
self._test_lr_scheduler(OneCycleLR, torch.optim.lr_scheduler.OneCycleLR, epochs, opts, atol, rtol)
def test_multisteplr(self): self._test_multisteplr(10, {'milestones': [1, 2, 7]}, 1e-6, 1e-6)
def test_multisteplr_gamma(self): self._test_multisteplr(10, {'milestones': [1, 2, 7], 'gamma': 0.1337}, 1e-6, 1e-6)
def test_reducelronplateau(self): self._test_reducelronplateau(100, {}, 1e-6, 1e-6)
def test_reducelronplateau_max(self): self._test_reducelronplateau(100, {'mode': 'max'}, 1e-6, 1e-6)
def test_reducelronplateau_factor(self): self._test_reducelronplateau(100, {'factor': 0.1337}, 1e-6, 1e-6)
def test_reducelronplateau_patience(self): self._test_reducelronplateau(100, {'patience': 3}, 1e-6, 1e-6)
def test_reducelronplateau_threshold(self): self._test_reducelronplateau(100, {'threshold': 1e-6}, 1e-6, 1e-6)
def test_reducelronplateau_threshold_mode(self): self._test_reducelronplateau(100, {'threshold_mode': 'abs'}, 1e-6, 1e-6)
def test_cosineannealinglr(self): self._test_cosineannealinglr(100, {}, 1e-6, 1e-6)
def test_cosineannealinglr_eta_min(self): self._test_cosineannealinglr(100, {'eta_min': 0.001}, 1e-6, 1e-6)
def test_onecyclelr(self): self._test_onecyclelr(1000, {'pct_start': 0.3, 'anneal_strategy': 'linear',
'cycle_momentum': False, 'div_factor': 25.0,
'final_div_factor': 10000.0, 'max_lr':1e-5}, 1e-6, 1e-6)
@unittest.skip("slow")
def test_training(self):
without = lr_scheduler_training()
sched_fns = [MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR, OneCycleLR]
argss = [{'milestones': [5, 7, 10, 15], 'gamma': 0.5}, {'factor': 0.5, 'patience': 2}, {'T_max': 25, 'eta_min': 0.001},
{'pct_start': 0.3, 'anneal_strategy': 'linear', 'cycle_momentum': False, 'div_factor': 25.0, 'final_div_factor': 10000.0, 'max_lr':1e-5, 'total_steps': 25}]
for sched_fn, args in zip(sched_fns, argss):
with_sched = lr_scheduler_training(sched_fn, args)
assert with_sched > without
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python
import io, unittest
import os
import tempfile
from unittest.mock import patch, MagicMock
import torch
import numpy as np
from tinygrad.helpers import CI
from extra.utils import fetch, temp, download_file
from tinygrad.nn.state import torch_load
from PIL import Image
@unittest.skipIf(CI, "no internet tests in CI")
class TestFetch(unittest.TestCase):
def test_fetch_bad_http(self):
self.assertRaises(AssertionError, fetch, 'http://httpstat.us/500')
self.assertRaises(AssertionError, fetch, 'http://httpstat.us/404')
self.assertRaises(AssertionError, fetch, 'http://httpstat.us/400')
def test_fetch_small(self):
    assert len(fetch('https://google.com')) > 0
def test_fetch_img(self):
img = fetch("https://media.istockphoto.com/photos/hen-picture-id831791190")
pimg = Image.open(io.BytesIO(img))
assert pimg.size == (705, 1024)
class TestFetchRelative(unittest.TestCase):
def setUp(self):
self.working_dir = os.getcwd()
self.tempdir = tempfile.TemporaryDirectory()
os.chdir(self.tempdir.name)
with open('test_file.txt', 'x') as f:
f.write("12345")
def tearDown(self):
os.chdir(self.working_dir)
self.tempdir.cleanup()
#test ./
def test_fetch_relative_dotslash(self):
self.assertEqual(b'12345', fetch("./test_file.txt"))
#test ../
def test_fetch_relative_dotdotslash(self):
os.mkdir('test_file_path')
os.chdir('test_file_path')
self.assertEqual(b'12345', fetch("../test_file.txt"))
class TestDownloadFile(unittest.TestCase):
def setUp(self):
from pathlib import Path
self.test_file = Path(temp("test_download_file/test_file.txt"))
def tearDown(self):
os.remove(self.test_file)
os.removedirs(self.test_file.parent)
@patch('requests.get')
def test_download_file_with_mkdir(self, mock_requests):
mock_response = MagicMock()
mock_response.iter_content.return_value = [b'1234', b'5678']
mock_response.status_code = 200
mock_response.headers = {'content-length': '8'}
mock_requests.return_value = mock_response
self.assertFalse(self.test_file.parent.exists())
download_file("https://www.mock.com/fake.txt", self.test_file, skip_if_exists=False)
self.assertTrue(self.test_file.parent.exists())
self.assertTrue(self.test_file.is_file())
self.assertEqual('12345678', self.test_file.read_text())
class TestUtils(unittest.TestCase):
def test_fake_torch_load_zipped(self): self._test_fake_torch_load_zipped()
def test_fake_torch_load_zipped_float16(self): self._test_fake_torch_load_zipped(isfloat16=True)
def _test_fake_torch_load_zipped(self, isfloat16=False):
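    # param1 and param2 are overlapping strided views into the same 16-element storage;
    # torch_load must honor their storage_offset when rebuilding the tensors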
class LayerWithOffset(torch.nn.Module):
def __init__(self):
super(LayerWithOffset, self).__init__()
d = torch.randn(16)
self.param1 = torch.nn.Parameter(
d.as_strided([2, 2], [1, 2], storage_offset=5)
)
self.param2 = torch.nn.Parameter(
d.as_strided([2, 2], [1, 2], storage_offset=4)
)
model = torch.nn.Sequential(
torch.nn.Linear(4, 8),
torch.nn.Linear(8, 3),
LayerWithOffset()
)
if isfloat16: model = model.half()
path = temp(f"test_load_{isfloat16}.pt")
torch.save(model.state_dict(), path)
model2 = torch_load(path)
for name, a in model.state_dict().items():
b = model2[name]
a, b = a.numpy(), b.numpy()
assert a.shape == b.shape
assert a.dtype == b.dtype
assert np.array_equal(a, b)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,15 @@
from tinygrad.ops import LazyOp, LoadOps
from tinygrad.nn.state import get_parameters
# for speed
def derandomize(x):
if isinstance(x, LazyOp):
new_op = LoadOps.EMPTY if x.op == LoadOps.RAND else x.op
return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), x.arg)
x.op = derandomize(x.op)
return x
def derandomize_model(model):
for p in get_parameters(model):
p.lazydata = derandomize(p.lazydata)
p.realize()
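# example usage (hypothetical model; any module whose weights come from get_parameters works):
#   model = ResNet18()
#   derandomize_model(model)  # RAND loads become EMPTY, so realizing weights skips RNG kernels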

Binary file not shown.

Binary file not shown.
File diff suppressed because it is too large

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
import torch
def get_question_samp(bsz, seq_len, vocab_size, seed):
np.random.seed(seed)
  in_ids = np.random.randint(vocab_size, size=(bsz, seq_len))
mask = np.random.choice([True, False], size=(bsz, seq_len))
seg_ids = np.random.randint(1, size=(bsz, seq_len))
return in_ids, mask, seg_ids
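# copy tinygrad's state dict into the torch model so both nets start from identical weights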
def set_equal_weights(mdl, torch_mdl):
from tinygrad.nn.state import get_state_dict
state, torch_state = get_state_dict(mdl), torch_mdl.state_dict()
assert len(state) == len(torch_state)
for k, v in state.items():
assert k in torch_state
torch_state[k].copy_(torch.from_numpy(v.numpy()))
torch_mdl.eval()
class TestBert(unittest.TestCase):
def test_questions(self):
from models.bert import BertForQuestionAnswering
from transformers import BertForQuestionAnswering as TorchBertForQuestionAnswering
from transformers import BertConfig
# small
config = {
'vocab_size':24, 'hidden_size':2, 'num_hidden_layers':2, 'num_attention_heads':2,
'intermediate_size':32, 'hidden_dropout_prob':0.1, 'attention_probs_dropout_prob':0.1,
'max_position_embeddings':512, 'type_vocab_size':2
}
# Create in tinygrad
Tensor.manual_seed(1337)
mdl = BertForQuestionAnswering(**config)
# Create in torch
with torch.no_grad():
torch_mdl = TorchBertForQuestionAnswering(BertConfig(**config))
set_equal_weights(mdl, torch_mdl)
seeds = (1337, 3141)
bsz, seq_len = 1, 16
for _, seed in enumerate(seeds):
in_ids, mask, seg_ids = get_question_samp(bsz, seq_len, config['vocab_size'], seed)
out = mdl(Tensor(in_ids), Tensor(mask), Tensor(seg_ids))
torch_out = torch_mdl.forward(torch.from_numpy(in_ids).long(), torch.from_numpy(mask), torch.from_numpy(seg_ids).long())[:2]
torch_out = torch.cat(torch_out).unsqueeze(2)
np.testing.assert_allclose(out.numpy(), torch_out.detach().numpy(), atol=5e-4, rtol=5e-4)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,115 @@
import ast
import pathlib
import sys
import unittest
import numpy as np
from PIL import Image
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor
from models.efficientnet import EfficientNet
from models.vit import ViT
from models.resnet import ResNet50
def _load_labels():
labels_filename = pathlib.Path(__file__).parent / 'efficientnet/imagenet1000_clsidx_to_labels.txt'
return ast.literal_eval(labels_filename.read_text())
_LABELS = _load_labels()
def preprocess(img, new=False):
# preprocess image
aspect_ratio = img.size[0] / img.size[1]
img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
img = np.array(img)
  y0, x0 = (np.asarray(img.shape)[:2] - 224) // 2
img = img[y0: y0 + 224, x0: x0 + 224]
  # normalize: new=True scales to roughly [-1, 1] (efficientnet-lite style); new=False applies ImageNet mean/std normalization in CHW layout
if new:
img = img.astype(np.float32)
img -= [127.0, 127.0, 127.0]
img /= [128.0, 128.0, 128.0]
img = img[None]
else:
img = np.moveaxis(img, [2, 0, 1], [0, 1, 2])
img = img.astype(np.float32)[:3].reshape(1, 3, 224, 224)
img /= 255.0
img -= np.array([0.485, 0.456, 0.406]).reshape((1, -1, 1, 1))
img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))
return img
def _infer(model: EfficientNet, img, bs=1):
Tensor.training = False
img = preprocess(img)
# run the net
if bs > 1: img = img.repeat(bs, axis=0)
out = model.forward(Tensor(img)).cpu()
return _LABELS[np.argmax(out.numpy()[0])]
chicken_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/Chicken.jpg')
car_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/car.jpg')
class TestEfficientNet(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = EfficientNet(number=getenv("NUM"))
cls.model.load_from_pretrained()
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
label = _infer(self.model, chicken_img)
self.assertEqual(label, "hen")
def test_chicken_bigbatch(self):
label = _infer(self.model, chicken_img, 2)
self.assertEqual(label, "hen")
def test_car(self):
label = _infer(self.model, car_img)
self.assertEqual(label, "sports car, sport car")
class TestViT(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = ViT()
cls.model.load_from_pretrained()
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
label = _infer(self.model, chicken_img)
self.assertEqual(label, "cock")
def test_car(self):
label = _infer(self.model, car_img)
self.assertEqual(label, "racer, race car, racing car")
class TestResNet(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = ResNet50()
cls.model.load_from_pretrained()
@classmethod
def tearDownClass(cls):
del cls.model
def test_chicken(self):
label = _infer(self.model, chicken_img)
self.assertEqual(label, "hen")
def test_car(self):
label = _infer(self.model, car_img)
self.assertEqual(label, "sports car, sport car")
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,165 @@
import torch
from torch import nn
import unittest
import numpy as np
from tinygrad.nn.state import get_parameters, get_state_dict
from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
from tinygrad.tensor import Tensor
from extra.datasets import fetch_mnist
from tinygrad.helpers import CI
def compare_tiny_torch(model, model_torch, X, Y):
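  # run one identical training step in tinygrad and torch (same weights, same input) and
  # assert that losses, per-parameter gradients, and post-step weights all match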
with Tensor.train():
model_torch.train()
model_state_dict = get_state_dict(model)
for k,v in model_torch.named_parameters():
if not CI: print(f"initting {k} from torch")
model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()
optimizer = optim.SGD(get_parameters(model), lr=0.001)
optimizer_torch = torch.optim.SGD(model_torch.parameters(), lr=0.001)
Xt = torch.Tensor(X.numpy())
np.testing.assert_allclose(X.numpy(), Xt.detach().numpy())
out = model(X)
loss = (out * Y).mean()
if not CI: print(loss.realize().numpy())
out_torch = model_torch(torch.Tensor(X.numpy()))
loss_torch = (out_torch * torch.Tensor(Y.numpy())).mean()
if not CI: print(loss_torch.detach().numpy())
# assert losses match
np.testing.assert_allclose(loss.realize().numpy(), loss_torch.detach().numpy(), atol=1e-4)
# zero and backward
optimizer.zero_grad()
loss.backward()
optimizer_torch.zero_grad()
loss_torch.backward()
for k,v in list(model_torch.named_parameters())[::-1]:
g = model_state_dict[k].grad.numpy()
gt = v.grad.detach().numpy()
if not CI: print("testing grads", k)
np.testing.assert_allclose(g, gt, atol=1e-3, err_msg=f'grad mismatch {k}')
# take the steps
optimizer.step()
optimizer_torch.step()
    # assert weights match after the optimizer step (only approximately, hence the atol)
for k,v in model_torch.named_parameters():
if not CI: print("testing weight", k)
np.testing.assert_allclose(model_state_dict[k].numpy(), v.detach().numpy(), atol=1e-3, err_msg=f'weight mismatch {k}')
def get_mnist_data():
X_train, Y_train, X_test, Y_test = fetch_mnist()
BS = 32
num_classes = 10
X = Tensor(X_test[0:BS].astype(np.float32))
Y = np.zeros((BS, num_classes), np.float32)
Y[range(BS),Y_test[0:BS]] = -1.0*num_classes
return X, Tensor(Y)
class TestEnd2End(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.X, cls.Y = get_mnist_data()
def setUp(self):
torch.manual_seed(123)
def test_linear_mnist(self):
class LinTiny:
def __init__(self, has_batchnorm=False):
self.l1 = Linear(784, 128)
self.l2 = Linear(128, 10)
self.bn1 = BatchNorm2d(128) if has_batchnorm else lambda x: x
def __call__(self, x):
return self.l2(self.l1(x)).relu().log_softmax(-1)
class LinTorch(nn.Module):
def __init__(self, has_batchnorm=False):
super().__init__()
self.l1 = nn.Linear(784, 128)
self.l2 = nn.Linear(128, 10)
def forward(self, x):
return self.l2(self.l1(x)).relu().log_softmax(-1)
compare_tiny_torch(LinTiny(), LinTorch(), self.X, self.Y)
def test_bn_mnist(self):
class LinTiny:
def __init__(self):
self.l1 = Linear(784, 128)
self.l2 = Linear(128, 10)
self.bn1 = BatchNorm2d(128)
def __call__(self, x):
return self.l2(self.bn1(self.l1(x).reshape(x.shape[0], -1, 1, 1)).reshape(x.shape[0], -1).relu()).log_softmax(-1)
class LinTorch(nn.Module):
def __init__(self):
super().__init__()
self.l1 = nn.Linear(784, 128)
self.l2 = nn.Linear(128, 10)
self.bn1 = nn.BatchNorm2d(128)
def forward(self, x):
return self.l2(self.bn1(self.l1(x).reshape(x.shape[0], -1, 1, 1)).reshape(x.shape[0], -1).relu()).log_softmax(-1)
compare_tiny_torch(LinTiny(), LinTorch(), self.X, self.Y)
def test_bn_alone(self):
np.random.seed(1337)
X = Tensor(np.random.randn(32, 10, 1, 1).astype(np.float32))
Y = Tensor(np.random.randn(32, 10, 1, 1).astype(np.float32))
compare_tiny_torch(BatchNorm2d(10), nn.BatchNorm2d(10), X, Y)
def test_bn_linear(self):
BS, K = 2, 1
eps = 0
X = Tensor([1,0]).reshape(BS, K, 1, 1)
Y = Tensor([-1,0]).reshape(BS, K, 1, 1)
class LinTiny:
def __init__(self):
self.l1 = Conv2d(K, K, 1, bias=False)
self.bn1 = BatchNorm2d(K, affine=False, track_running_stats=False, eps=eps)
def __call__(self, x): return self.bn1(self.l1(x))
class LinTorch(nn.Module):
def __init__(self):
super().__init__()
self.l1 = nn.Conv2d(K, K, 1, bias=False)
self.bn1 = nn.BatchNorm2d(K, affine=False, track_running_stats=False, eps=eps)
def forward(self, x): return self.bn1(self.l1(x))
model_torch = LinTorch()
with torch.no_grad():
model_torch.l1.weight[:] = 1.
compare_tiny_torch(LinTiny(), model_torch, X, Y)
def test_conv_mnist(self):
class LinTiny:
def __init__(self, has_batchnorm=False):
self.c1 = Conv2d(1, 8, 3, stride=2)
self.c2 = Conv2d(8, 16, 3, stride=2)
self.l1 = Linear(16*6*6, 10)
if has_batchnorm:
self.bn1, self.bn2 = BatchNorm2d(8), BatchNorm2d(16)
else:
self.bn1, self.bn2 = lambda x: x, lambda x: x
def __call__(self, x):
return self.l1(self.bn2(self.c2(self.bn1(self.c1(x)).relu())).relu().reshape(x.shape[0], -1)).log_softmax(-1)
class LinTorch(nn.Module):
def __init__(self, has_batchnorm=False):
super().__init__()
self.c1 = nn.Conv2d(1, 8, 3, stride=2)
self.c2 = nn.Conv2d(8, 16, 3, stride=2)
self.l1 = nn.Linear(16*6*6, 10)
if has_batchnorm:
self.bn1, self.bn2 = nn.BatchNorm2d(8), nn.BatchNorm2d(16)
else:
self.bn1, self.bn2 = lambda x: x, lambda x: x
def forward(self, x):
return self.l1(self.bn2(self.c2(self.bn1(self.c1(x)).relu())).relu().reshape(x.shape[0], -1)).log_softmax(-1)
for has_batchnorm in [False, True]:
with self.subTest(has_batchnorm=has_batchnorm):
compare_tiny_torch(LinTiny(has_batchnorm), LinTorch(has_batchnorm), self.X.reshape((-1, 1, 28, 28)), self.Y)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import optim, BatchNorm2d
from extra.training import train, evaluate
from extra.datasets import fetch_mnist
import pytest
pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
# load the mnist dataset
X_train, Y_train, X_test, Y_test = fetch_mnist()
# create a model
class TinyBobNet:
def __init__(self):
self.l1 = Tensor.scaled_uniform(784, 128)
self.l2 = Tensor.scaled_uniform(128, 10)
def parameters(self):
return get_parameters(self)
def forward(self, x):
return x.dot(self.l1).relu().dot(self.l2).log_softmax()
# create a model with a conv layer
class TinyConvNet:
def __init__(self, has_batchnorm=False):
# https://keras.io/examples/vision/mnist_convnet/
conv = 3
#inter_chan, out_chan = 32, 64
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.scaled_uniform(inter_chan,1,conv,conv)
self.c2 = Tensor.scaled_uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.scaled_uniform(out_chan*5*5, 10)
if has_batchnorm:
self.bn1 = BatchNorm2d(inter_chan)
self.bn2 = BatchNorm2d(out_chan)
else:
self.bn1, self.bn2 = lambda x: x, lambda x: x
def parameters(self):
return get_parameters(self)
def forward(self, x:Tensor):
x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
x = self.bn1(x.conv2d(self.c1)).relu().max_pool2d()
x = self.bn2(x.conv2d(self.c2)).relu().max_pool2d()
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1).log_softmax()
class TestMNIST(unittest.TestCase):
def test_sgd_onestep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=1)
for p in model.parameters(): p.realize()
def test_sgd_threestep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=3)
def test_sgd_sixstep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=6, noloss=True)
def test_adam_onestep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=1)
for p in model.parameters(): p.realize()
def test_adam_threestep(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=3)
def test_conv_onestep(self):
np.random.seed(1337)
model = TinyConvNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, BS=69, steps=1, noloss=True)
for p in model.parameters(): p.realize()
def test_conv(self):
np.random.seed(1337)
model = TinyConvNet()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, steps=100)
assert evaluate(model, X_test, Y_test) > 0.93 # torch gets 0.9415 sometimes
def test_conv_with_bn(self):
np.random.seed(1337)
model = TinyConvNet(has_batchnorm=True)
optimizer = optim.AdamW(model.parameters(), lr=0.003)
train(model, X_train, Y_train, optimizer, steps=200)
assert evaluate(model, X_test, Y_test) > 0.94
def test_sgd(self):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train(model, X_train, Y_train, optimizer, steps=600)
assert evaluate(model, X_test, Y_test) > 0.94 # CPU gets 0.9494 sometimes
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,143 @@
#!/usr/bin/env python
import os
import time
import io
import unittest
import numpy as np
import onnx
from extra.utils import fetch, temp
from extra.onnx import get_run_onnx
from tinygrad.tensor import Tensor
from tinygrad.helpers import CI
import pytest
pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
def run_onnx_torch(onnx_model, inputs):
import torch
from onnx2torch import convert
torch_model = convert(onnx_model).float()
with torch.no_grad():
torch_out = torch_model(*[torch.tensor(x) for x in inputs.values()])
return torch_out
OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx"
np.random.seed(1337)
class TestOnnxModel(unittest.TestCase):
def test_benchmark_openpilot_model(self):
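    # time 7 back-to-back runs, separating graph construction (run_onnx), realize, and the
    # device-to-host copy; outside CI, also profile one run and dump a flamegraph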
dat = fetch(OPENPILOT_MODEL)
onnx_model = onnx.load(io.BytesIO(dat))
run_onnx = get_run_onnx(onnx_model)
def get_inputs():
np_inputs = {
"input_imgs": np.random.randn(*(1, 12, 128, 256)),
"big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
"desire": np.zeros((1, 100, 8)),
"traffic_convention": np.array([[1., 0.]]),
"nav_features": np.zeros((1, 256)),
"features_buffer": np.zeros((1, 99, 128)),
}
inputs = {k:Tensor(v.astype(np.float32), requires_grad=False) for k,v in np_inputs.items()}
return inputs
for _ in range(7):
inputs = get_inputs()
st = time.monotonic()
tinygrad_out = run_onnx(inputs)['outputs']
mt = time.monotonic()
tinygrad_out.realize()
mt2 = time.monotonic()
tinygrad_out = tinygrad_out.numpy()
et = time.monotonic()
if not CI: print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")
if not CI:
import cProfile
import pstats
inputs = get_inputs()
pr = cProfile.Profile(timer=time.perf_counter_ns, timeunit=1e-6)
pr.enable()
tinygrad_out = run_onnx(inputs)['outputs']
tinygrad_out.realize()
tinygrad_out = tinygrad_out.numpy()
if not CI:
pr.disable()
stats = pstats.Stats(pr)
stats.dump_stats(temp("net.prof"))
os.system(f"flameprof {temp('net.prof')} > {temp('prof.svg')}")
ps = stats.sort_stats(pstats.SortKey.TIME)
ps.print_stats(30)
def test_openpilot_model(self):
dat = fetch(OPENPILOT_MODEL)
onnx_model = onnx.load(io.BytesIO(dat))
run_onnx = get_run_onnx(onnx_model)
print("got run_onnx")
inputs = {
"input_imgs": np.random.randn(*(1, 12, 128, 256)),
"big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
"desire": np.zeros((1, 100, 8)),
"traffic_convention": np.array([[1., 0.]]),
"nav_features": np.zeros((1, 256)),
"features_buffer": np.zeros((1, 99, 128)),
}
inputs = {k:v.astype(np.float32) for k,v in inputs.items()}
st = time.monotonic()
print("****** run onnx ******")
tinygrad_out = run_onnx(inputs)['outputs']
mt = time.monotonic()
print("****** realize ******")
tinygrad_out.realize()
mt2 = time.monotonic()
tinygrad_out = tinygrad_out.numpy()
et = time.monotonic()
print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")
Tensor.no_grad = True
torch_out = run_onnx_torch(onnx_model, inputs).numpy()
Tensor.no_grad = False
print(tinygrad_out, torch_out)
np.testing.assert_allclose(torch_out, tinygrad_out, atol=1e-4, rtol=1e-2)
def test_efficientnet(self):
dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx")
input_name, input_new = "images:0", True
self._test_model(dat, input_name, input_new)
def test_shufflenet(self):
dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx")
print(f"shufflenet downloaded : {len(dat)/1e6:.2f} MB")
input_name, input_new = "gpu_0/data_0", False
self._test_model(dat, input_name, input_new)
@unittest.skip("test is very slow")
def test_resnet(self):
# NOTE: many onnx models can't be run right now due to max pool with strides != kernel_size
dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx")
print(f"resnet downloaded : {len(dat)/1e6:.2f} MB")
input_name, input_new = "data", False
self._test_model(dat, input_name, input_new)
def _test_model(self, dat, input_name, input_new, debug=False):
onnx_model = onnx.load(io.BytesIO(dat))
print("onnx loaded")
from test.models.test_efficientnet import chicken_img, car_img, preprocess, _LABELS
run_onnx = get_run_onnx(onnx_model)
def run(img):
inputs = {input_name: preprocess(img, new=input_new)}
tinygrad_out = list(run_onnx(inputs, debug=debug).values())[0].numpy()
return tinygrad_out.argmax()
cls = run(chicken_img)
print(cls, _LABELS[cls])
assert _LABELS[cls] == "hen" or _LABELS[cls] == "cock"
cls = run(car_img)
print(cls, _LABELS[cls])
assert "car" in _LABELS[cls] or _LABELS[cls] == "convertible"
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,100 @@
import unittest, time
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn import optim
from tinygrad.nn.state import get_parameters
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
from tinygrad.ops import Device, GlobalCounters
from tinygrad.helpers import CI, dtypes, getenv, prod
from test.helpers import derandomize_model
from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
from examples.hlb_cifar10 import SpeedyResNet
from examples.llama import Transformer as LLaMaTransformer, MODEL_PARAMS as LLAMA_MODEL_PARAMS
from examples.stable_diffusion import UNetModel
def helper_test(nm, gen, train, max_memory_allowed, max_kernels_allowed, all_jitted=False):
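# run each step 4 times and report the fastest: the first call pays for kernel
# compilation and JIT capture, later calls measure the steady state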
tms = []
for _ in range(4):
GlobalCounters.reset()
GlobalCounters.mem_used = 0
Device[Device.DEFAULT].synchronize()
st = time.perf_counter_ns()
train(*gen())
Device[Device.DEFAULT].synchronize()
tms.append(time.perf_counter_ns() - st)
kernels_used = len(train.jit_cache) if hasattr(train, "jit_cache") else None
print(f"{nm}: used {GlobalCounters.mem_used/1e9:.2f} GB and {kernels_used} kernels in {min(tms)/1e6:.2f} ms")
assert GlobalCounters.mem_used/1e9 < max_memory_allowed, f"{nm} used more than {max_memory_allowed:.2f} GB"
assert not kernels_used or kernels_used <= max_kernels_allowed, f"{nm} used more than {max_kernels_allowed} kernels"
if all_jitted:
assert kernels_used > 0 and kernels_used == GlobalCounters.kernel_count, f"only {kernels_used} out of {GlobalCounters.kernel_count} were jitted"
class TestRealWorld(unittest.TestCase):
def setUp(self):
self.old_type = Tensor.default_type
np.random.seed(2002)
def tearDown(self):
Tensor.default_type = self.old_type
@unittest.skipUnless(not CI, "too big for CI")
def test_stable_diffusion(self):
model = UNetModel()
derandomize_model(model)
@TinyJit
def test(t, t2): return model(t, 801, t2).realize()
helper_test("test_sd", lambda: (Tensor.randn(1, 4, 64, 64),Tensor.randn(1, 77, 768)), test, 18.0, 967)
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["LLVM"], "needs JIT, too long on CI LLVM")
def test_llama(self):
Tensor.default_type = dtypes.float16
args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
model = LLaMaTransformer(**(args_tiny if CI else LLAMA_MODEL_PARAMS["1"]["7B"]["args"]))
derandomize_model(model)
@TinyJit
def test(t): return model(t, 0).realize()
# NOTE: only test one pass, not testing the dynamic shape autoregressive part
helper_test("test_llama", lambda: (Tensor([[1,]]),), test, 0.22 if CI else 13.5, 126 if CI else 486, all_jitted=True)
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM"] or not CI), "needs JIT, too long on CI LLVM")
def test_gpt2(self):
Tensor.default_type = dtypes.float16
args_tiny = {"dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-5, "vocab_size": 1000}
model = GPT2Transformer(**(args_tiny if CI else GPT2_MODEL_PARAMS["gpt2-medium"]))
derandomize_model(model)
@TinyJit
def test(t): return model(t, 0).realize()
helper_test("test_gpt2", lambda: (Tensor([[1,]]),), test, 0.21 if CI else 0.9, 129 if CI else 369, all_jitted=True)
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM", "CLANG"] or not CI), "needs JIT, too long on CI LLVM and CLANG")
def test_train_cifar(self):
# TODO: with default device
#old_default = Device.DEFAULT
#Device.DEFAULT = "FAKE"
#Device['fake'].codegen = Device[old_default].codegen
with Tensor.train():
model = SpeedyResNet(Tensor.ones((12,3,2,2)))
optimizer = optim.SGD(get_parameters(model), lr=0.01, momentum=0.8, nesterov=True, weight_decay=0.15)
BS = 32 if CI else 512
@TinyJit
def train(X):
out = model(X)
loss = out.mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()
helper_test("train_cifar", lambda: (Tensor.randn(BS, 3, 32, 32),), train, (1.0/48)*BS, 154) # it's 154 on metal
# reset device
#Device.DEFAULT = old_default
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from models.rnnt import LSTM
import torch
class TestRNNT(unittest.TestCase):
def test_lstm(self):
BS, SQ, IS, HS, L = 2, 20, 40, 128, 2
# create in torch
with torch.no_grad():
torch_layer = torch.nn.LSTM(IS, HS, L)
# create in tinygrad
layer = LSTM(IS, HS, L, 0.0)
# copy weights
with torch.no_grad():
layer.cells[0].weights_ih.assign(Tensor(torch_layer.weight_ih_l0.numpy()))
layer.cells[0].weights_hh.assign(Tensor(torch_layer.weight_hh_l0.numpy()))
layer.cells[0].bias_ih.assign(Tensor(torch_layer.bias_ih_l0.numpy()))
layer.cells[0].bias_hh.assign(Tensor(torch_layer.bias_hh_l0.numpy()))
layer.cells[1].weights_ih.assign(Tensor(torch_layer.weight_ih_l1.numpy()))
layer.cells[1].weights_hh.assign(Tensor(torch_layer.weight_hh_l1.numpy()))
layer.cells[1].bias_ih.assign(Tensor(torch_layer.bias_ih_l1.numpy()))
layer.cells[1].bias_hh.assign(Tensor(torch_layer.bias_hh_l1.numpy()))
# test initial hidden
for _ in range(3):
x = Tensor.randn(SQ, BS, IS)
z, hc = layer(x, None)
torch_x = torch.tensor(x.numpy())
torch_z, torch_hc = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
# test passing hidden
for _ in range(3):
x = Tensor.randn(SQ, BS, IS)
z, hc = layer(x, hc)
torch_x = torch.tensor(x.numpy())
torch_z, torch_hc = torch_layer(torch_x, torch_hc)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,83 @@
import unittest
import time
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.tensor import Device
from tinygrad.helpers import getenv
from extra.training import train
from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.transformer import Transformer
from models.vit import ViT
from models.resnet import ResNet18
import pytest
pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
BS = getenv("BS", 2)
def train_one_step(model,X,Y):
params = get_parameters(model)
pcount = 0
for p in params:
pcount += np.prod(p.shape)
optimizer = optim.SGD(params, lr=0.001)
print("stepping %r with %.1fM params bs %d" % (type(model), pcount/1e6, BS))
st = time.time()
train(model, X, Y, optimizer, steps=1, BS=BS)
et = time.time()-st
print("done in %.2f ms" % (et*1000.))
def check_gc():
if Device.DEFAULT == "GPU":
from extra.introspection import print_objects
assert print_objects() == 0
class TestTrain(unittest.TestCase):
def test_convnext(self):
model = ConvNeXt(depths=[1], dims=[16])
X = np.zeros((BS,3,224,224), dtype=np.float32)
Y = np.zeros((BS), dtype=np.int32)
train_one_step(model,X,Y)
check_gc()
def test_efficientnet(self):
model = EfficientNet(0)
X = np.zeros((BS,3,224,224), dtype=np.float32)
Y = np.zeros((BS), dtype=np.int32)
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "too many buffers for webgpu")
def test_vit(self):
model = ViT()
X = np.zeros((BS,3,224,224), dtype=np.float32)
Y = np.zeros((BS,), dtype=np.int32)
train_one_step(model,X,Y)
check_gc()
def test_transformer(self):
# this is meant to be GPT-2 small, but the param count is off
# (real GPT-2 uses ff_dim = 768*4, not 768//4)
model = Transformer(syms=10, maxlen=6, layers=12, embed_dim=768, num_heads=12, ff_dim=768//4)
X = np.zeros((BS,6), dtype=np.float32)
Y = np.zeros((BS,6), dtype=np.int32)
train_one_step(model,X,Y)
check_gc()
def test_resnet(self):
X = np.zeros((BS, 3, 224, 224), dtype=np.float32)
Y = np.zeros((BS), dtype=np.int32)
for resnet_v in [ResNet18]:
model = resnet_v()
model.load_from_pretrained()
train_one_step(model, X, Y)
check_gc()
def test_bert(self):
# TODO: write this
pass
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env python
import pathlib
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
class TestVGG7(unittest.TestCase):
def test_vgg7(self):
from examples.vgg7_helpers.waifu2x import Vgg7, image_load
# Create in tinygrad
Tensor.manual_seed(1337)
mdl = Vgg7()
mdl.load_from_pretrained()
# Scale up an image
test_x = image_load(pathlib.Path(__file__).parent / 'waifu2x/input.png')
test_y = image_load(pathlib.Path(__file__).parent / 'waifu2x/output.png')
scaled = mdl.forward_tiled(test_x, 156)
scaled = np.fmax(0, np.fmin(1, scaled))
np.testing.assert_allclose(scaled, test_y, atol=5e-3, rtol=5e-3)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,25 @@
import unittest
import pathlib
from tinygrad.ops import Device
from examples.whisper import init_whisper, transcribe_file
@unittest.skipUnless(Device.DEFAULT == "METAL", "Some non-metal backends spend too long trying to allocate a 20GB array")
class TestWhisper(unittest.TestCase):
@classmethod
def setUpClass(cls):
model, enc = init_whisper("tiny.en")
cls.model = model
cls.enc = enc
@classmethod
def tearDownClass(cls):
del cls.model
del cls.enc
def test_transcribe_file(self):
# Audio generated with the command on MacOS:
# say "Could you please let me out of the box?" --file-format=WAVE --data-format=LEUI8@16000 -o test
# We use the WAVE type because it's easier to decode in CI test environments
filename = str(pathlib.Path(__file__).parent / "whisper/test.wav")
transcription = transcribe_file(self.model, self.enc, filename)
self.assertEqual("<|startoftranscript|><|notimestamps|> Could you please let me out of the box?<|endoftext|>", transcription)

Binary file not shown.


Binary file not shown.


Binary file not shown.

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
import unittest
import numpy as np
from weakref import ref
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
def check_gc():
if Device.DEFAULT == "GPU":
from extra.introspection import print_objects
assert print_objects() == 0
class FakeDeviceBuffer:
def __init__(self, sz, dt, device):
self.id = 1
self.size = sz
self.dtype = dt
self.device = device
def __del__(self):
assert self.id == 0, "_do_free() should have been called before deletion"
class FakeAllocator(LRUAllocator):
def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
def _do_free(self, buf):
buf.id -= 1
assert buf.id == 0, f"free should be called exactly once, but id is {buf.id}"
def __del__(self): # Fake allocator should clear all buffers after each test.
for v in self.cached_buffers.values():
for buf, _ in v: self._free_buffer(buf)
FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
def __init__(self, size, dtype, device='0'):
global FAKE_GLOBAL_ALLOCATOR
super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
def alloc(allocator, size, dtype, **kwargs):
global FAKE_GLOBAL_ALLOCATOR
FAKE_GLOBAL_ALLOCATOR = allocator
buf = FakeBuffer(size, dtype, **kwargs)
assert buf.dtype == dtype and buf.size == size
FAKE_GLOBAL_ALLOCATOR = None
return buf
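# alloc_free_trace allocates a buffer and immediately lets it die, so the LRU allocator
# keeps the underlying FakeDeviceBuffer in its cache; the returned weakref lets tests
# check (via cmp_trace_and_buf) whether a later allocation reused that exact device buffer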
def alloc_free_trace(allocator, size, dtype, **kwargs):
buf = alloc(allocator, size, dtype, **kwargs)
return ref(buf._buf)
def cmp_trace_and_buf(buf, trace_ref): return trace_ref and trace_ref() == buf._buf
class TestAllocators(unittest.TestCase):
def test_lru_allocator_reusage(self):
mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
def test():
lru_allocator = FakeAllocator(2048)
traced_buf = alloc_free_trace(lru_allocator, 16, dtypes.float32)
assert GlobalCounters.mem_cached - mc == 16*dtypes.float32.itemsize, "Buffer should be cached"
for _ in range(32):
def __test():
buf = alloc(lru_allocator, 16, dtypes.float32)
assert cmp_trace_and_buf(buf, traced_buf), "Buffer should be reused"
__test()
usedbuf = alloc(lru_allocator, 16, dtypes.float32)
for _ in range(32):
def __test():
buf = alloc(lru_allocator, 16, dtypes.float32)
assert usedbuf != buf, "nobody should get the buffer that is still in use"
__test()
assert GlobalCounters.mem_used - mu == 16*dtypes.float32.itemsize, "Only usedbuf is still allocated."
test()
check_gc()
def test_lru_allocator_cache_free(self):
mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
def test():
lru_allocator = FakeAllocator(128)
refs = []
for _ in range(32):
refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32))
for sz in range(1, 32):
alloc_free_trace(lru_allocator, sz, dtypes.float32)
assert GlobalCounters.mem_used + GlobalCounters.mem_cached - mc - mu <= 128, "Should not allocate on device more than allowed (128)"
for r in refs: assert r() is None, "All refs should be dead, since buffers were cleared from cache"
test()
check_gc()
def test_lru_allocator_multidevice(self):
def test():
lru_allocator = FakeAllocator(256)
refs=[]
for i in range(8):
refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32, device=str(i)))
for i in range(64):
def __test():
dev = str(i % 8)
buf = alloc(lru_allocator, 16, dtypes.float32, device=dev)
assert cmp_trace_and_buf(buf, refs[i%8]), "Buffer should be reused"
__test()
for r in refs: assert r() is not None, "All refs should be cached"
test()
check_gc()
@unittest.skip("failing in CI")
def test_gpu_copyout(self):
def test():
from tinygrad.runtime.ops_gpu import CL
# Allocation to init the allocator.
tx = Tensor.rand(1)
tx.realize()
free_space = CL.cl_allocator.free_space[tx.lazydata.realized._device]
# fill half of the free space with a short-lived trash allocation (left in the LRU cache),
# then allocate a third of it and copy it out under memory pressure
will_allocate = free_space // 3
trash_allocation_size = free_space // 2
def sp():
trash_buffer = Tensor.rand(trash_allocation_size // 4)
trash_buffer.realize()
sp()
xx = Tensor.rand(will_allocate // 4)
_ = xx.numpy()
test()
check_gc()
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.helpers import dtypes
N = 200 # has to be bigger than the cache to fail
class TestAssign(unittest.TestCase):
def test_simple_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
a.realize()
b.realize()
ba1 = a.lazydata.realized
bb1 = b.lazydata.realized
a += b
a.realize()
ba2 = a.lazydata.realized
assert ba1 == ba2 and ba1 != bb1
np.testing.assert_allclose(a.numpy(), (np.arange(N*N)*2).reshape((N,N)))
@unittest.skipIf(Device.DEFAULT == "CPU" or Device.DEFAULT == "TORCH", "questionable tests")
def test_permuted_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
a.realize()
b.realize()
ba1 = a.lazydata.realized
bb1 = b.lazydata.realized
a = a.permute(1,0)
a += b
a.realize()
ba2 = a.lazydata.realized
assert ba1 != ba2 and ba1 != bb1
np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
def test_post_permuted_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
a.realize()
b.realize()
#GlobalCounters.cache = []
ba1 = a.lazydata.realized
bb1 = b.lazydata.realized
a.assign(a.permute(1,0) + b) # this should not work!
a.realize()
ba2 = a.lazydata.realized
# NOTE: don't test that it's assigned
#assert ba1 == ba2 and ba1 != bb1
np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
# TODO: is there a way to sneak in a permute such that it returns the wrong answer?
def test_cast_assignment(self):
a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
a.realize()
oba1 = a.lazydata.output_buffer
a.assign(a.cast(dtypes.int32).realize())
a.realize()
oba2 = a.lazydata.output_buffer
assert oba1 is None and oba2 is None
np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N)))
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,147 @@
import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
import pytest
pytestmark = [pytest.mark.exclude_cuda]
class TestConv(unittest.TestCase):
def test_simple(self):
x = Tensor.ones(1,12,128,256).contiguous().realize()
w = Tensor.ones(32,12,3,3).contiguous().realize()
ret = x.conv2d(w, stride=(2,2), padding=(1,1)).numpy()
# a full 3x3 window over 12 input channels of ones sums to 12*9 = 108; windows
# overlapping the zero padding sum to less
assert (ret[:, :, 1:-1, 1:-1] == 108).all()
assert ret[0,0,0,0] == 48 # corner: 2x2 valid window, 12*4
assert ret[0,0,0,1] == 72 # top edge: 2x3 valid window, 12*6
def test_simple_rand(self):
x = Tensor.rand(1,12,128,256)
w = Tensor.rand(32,12,3,3)
ret = x.conv2d(w, stride=(2,2), padding=(1,1)).numpy()
def test_many_simple(self):
x = Tensor(np.arange(8*2*8).reshape(1,8,2,8).astype(np.float32))
#w = Tensor(np.arange(8*8*1*1).reshape(8,8,1,1).astype(np.float32))
w = Tensor.eye(8).reshape((8,8,1,1))
ret = x.conv2d(w, stride=(1,2), padding=(0,0)).numpy()
print(ret)
def test_lazycache(self):
Tensor.no_grad = True
x = Tensor.rand(1, 32)
y = Tensor.rand(32)
out = x + y.reshape((1,32,1)).reshape((1,32)) + y.reshape((1,32,1)).reshape((1,32))
out.numpy()
Tensor.no_grad = False
def test_simple_biased(self):
C = 8
x = Tensor.rand(1,C,5,5)
w = Tensor.eye(C).reshape((C,C,1,1))
b = Tensor(np.arange(C).astype(np.float32))
ret = Tensor.conv2d(x,w,b).relu().conv2d(w,b)
print(ret.numpy())
def test_two_binops_no_rerun(self):
Tensor.no_grad = True
x = Tensor.randn(1,12,128,256)
w = Tensor.randn(32,12,3,3)
out = x.conv2d(w, stride=(2,2), padding=(1,1))
r1, r2 = out.relu(), (out-1)
np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0))
np.testing.assert_allclose(r2.numpy(), out.numpy() - 1)
Tensor.no_grad = False
def test_two_overlapping_binops_no_rerun(self):
Tensor.no_grad = True
x = Tensor.randn(1,12,128,256)
w = Tensor.randn(32,12,3,3)
out = x.conv2d(w, stride=(2,2), padding=(1,1))
r1, r2 = out.relu(), out.elu()
np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0))
np.testing.assert_allclose(r2.numpy(), np.where(out.numpy() > 0, out.numpy(), (np.exp(out.numpy()) - 1)), atol=1e-5)
Tensor.no_grad = False
@unittest.skipIf(Device.DEFAULT != "TORCH", "Takes too long to compile for Compiled backends")
def test_two_overlapping_binops_no_rerun_wino(self):
Tensor.no_grad = True
old_wino = Tensor.wino
Tensor.wino = True
x = Tensor.randn(1,4,16,16)
w = Tensor.randn(6,4,3,3)
out = x.conv2d(w, padding=(1,1))
r1, r2 = out.relu(), out.elu()
np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0))
np.testing.assert_allclose(r2.numpy(), np.where(out.numpy() > 0, out.numpy(), (np.exp(out.numpy()) - 1)), atol=1e-5)
Tensor.wino = old_wino
Tensor.no_grad = False
def test_first_three(self):
Tensor.no_grad = True
x = Tensor.rand(1,12,128,256)
w = Tensor.rand(32,12,3,3)
x = x.conv2d(w, stride=(2,2), padding=(1,1)).elu()
w = Tensor.rand(32,1,3,3)
x = x.conv2d(w, padding=(1,1), groups=32).elu()
w = Tensor.rand(16,32,1,1)
x = x.conv2d(w).elu()
x = x.numpy()
print(x.shape)
Tensor.no_grad = False
def test_elu(self):
Tensor.no_grad = True
x = Tensor.rand(1,12,128,256)
w = Tensor.rand(32,12,3,3)
x = x.conv2d(w, stride=(2,2), padding=(1,1))
x = x.elu()
w = Tensor.rand(32,1,3,3)
x = x.conv2d(w, padding=(1,1), groups=32)
out = x.numpy()
Tensor.no_grad = False
def test_reduce_relu(self):
Tensor.no_grad = True
x = Tensor.rand(1,12,128,256)
x = x.sum(keepdim=True).relu()
out = x.numpy()
Tensor.no_grad = False
def test_bias(self):
Tensor.no_grad = True
from tinygrad.nn import Conv2d
x = Tensor.rand(1,12,128,256)
c = Conv2d(12, 32, 3)
x = c(x).relu()
w = Tensor.uniform(32, 1, 3, 3)
x = x.conv2d(w, groups=32)
out = x.numpy()
Tensor.no_grad = False
def test_multiadd(self):
w = Tensor.rand(32)
x = Tensor.rand(32).relu()
(w+x).numpy()
def test_reorder(self):
x = Tensor.rand(1,12,128,256)
w = Tensor.rand(12,12,3,3)
x = x.conv2d(w, padding=(1,1))
print(x.shape)
x = x.reshape((1, 12, 256, 128))
x += 1
x += 1
x = x.reshape((1, 12, 128, 256))
x.numpy()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,27 @@
#!/usr/bin/env python
import unittest
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import Conv2d
from tinygrad.jit import CacheCollector
import pytest
pytestmark = pytest.mark.webgpu
#@unittest.skipUnless(Device.DEFAULT == "GPU", "Only GPU supports cache")
@unittest.skip("with JIT changes, you only get the raw buffer")
class TestConvShapetracker(unittest.TestCase):
def test_conv_3x3_one_view(self):
inp = Tensor.randn(1,16,10,10).realize()
conv = Conv2d(16, 32, (3,3))
conv(inp).realize()
CacheCollector.start()
conv(inp).realize()
test = CacheCollector.finish()
assert len(test) == 1, f"conv should only have one kernel {[x[0].name for x in test]}"
print(test[0][0].prg)
for arg in test[0][1]:
print(arg.st)
assert len(arg.st.views) == 1
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,107 @@
# this is an example of how you can write terrible DSP compute breaking ops like warpPerspective
# here we use a CUSTOM op to write atan2
import unittest
import numpy as np
from typing import Optional, Tuple
from tinygrad.helpers import prod, dtypes
# *** first, we implement the atan2 op at the lowest level ***
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
from tinygrad.lazy import LazyBuffer, create_lazybuffer
from tinygrad.ops import ASTRunner, Device
from tinygrad.shape.shapetracker import ShapeTracker
import pytest
pytestmark = pytest.mark.webgpu
# we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
assert a.device == "GPU" and b.device == "GPU", "gpu function requires GPUBuffers"
assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
ret.realized = Device[ret.device].buffer(prod(ret.shape), ret.dtype)
ASTRunner("atan2_gpu", """
__kernel void atan2_gpu(global float *c, global float *a, global float *b) {
int idx = get_global_id(0);
c[idx] = atan2(a[idx], b[idx]);
}""", global_size=[prod(ret.shape)]).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret.realized, a.realized, b.realized])
return ret.realized
def atan2_cpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
return Device[ret.device].from_underlying(np.arctan2(a.realized._buf, b.realized._buf))
# *** second, we write the ATan2 mlop ***
# NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
# In general, writing a backward function is optional; the backward pass just won't work without one
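# for reference, the derivative itself (standard calculus, nothing tinygrad-specific):
#   d/da atan2(a, b) =  b / (a^2 + b^2)
#   d/db atan2(a, b) = -a / (a^2 + b^2)
# which is exactly what backward() below builds out of BinaryOps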
from tinygrad.ops import LazyOp, LoadOps, BinaryOps, UnaryOps
from tinygrad.lazy import LazyBuffer
from tinygrad.tensor import Function
class ATan2(Function):
def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
self.a, self.b = a, b
ast = LazyOp(LoadOps.CUSTOM, (a.contiguous(), b.contiguous()), {"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device])
return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), LoadOps, ast, max(a.dtype, b.dtype))
def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
denom = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b))
return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \
grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.SUB, self.a).e(BinaryOps.DIV, denom)) if self.needs_input_grad[1] else None
# *** third, we use our lovely new mlop in some tests ***
from tinygrad.tensor import Tensor
@unittest.skipUnless(Device.DEFAULT in ["CPU", "GPU"], "atan2 is only implemented for CPU and GPU")
class TestCustomFunction(unittest.TestCase):
def test_atan2_forward(self):
# create some random Tensors, permute them just because we can
a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
# run the forward pass. note: up until the .numpy(), it's all lazy
c = ATan2.apply(a, b)
print(c.numpy())
# check the forward pass (in numpy)
np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)
# fun fact, this never actually calls forward, so it works in all the backends
def test_atan2_backward(self):
# have to go forward before we can go backward
a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
c = ATan2.apply(a, b)
# run the backward pass
c.mean().backward()
assert a.grad is not None and b.grad is not None, "tinygrad didn't compute gradients"
print(a.grad.numpy())
print(b.grad.numpy())
# check the backward pass (in torch)
import torch
ta, tb = torch.tensor(a.numpy(), requires_grad=True), torch.tensor(b.numpy(), requires_grad=True)
tc = torch.atan2(ta, tb)
tc.mean().backward()
assert ta.grad is not None and tb.grad is not None, "torch didn't compute gradients"
np.testing.assert_allclose(a.grad.numpy(), ta.grad.numpy(), atol=1e-5)
np.testing.assert_allclose(b.grad.numpy(), tb.grad.numpy(), atol=1e-5)
def test_atan2_jit(self):
# custom ops even work in the JIT!
from tinygrad.jit import TinyJit
@TinyJit
def jitted_atan2(a:Tensor, b:Tensor) -> Tensor:
return ATan2.apply(a, b).realize()
for _ in range(5):
a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
c = jitted_atan2(a, b)
np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,182 @@
import unittest
import numpy as np
from tinygrad.helpers import CI, DTYPES_DICT, getenv, DType, DEBUG, ImageDType, PtrDType
from tinygrad.ops import Device
from tinygrad.tensor import Tensor, dtypes
from typing import Any, List
from extra.utils import OSX, temp
def is_dtype_supported(dtype: DType):
# for GPU, cl_khr_fp16 isn't supported (except now we don't need it!)
# for LLVM, it segfaults because it can't link to the casting function
if dtype == dtypes.half: return not (CI and Device.DEFAULT in ["GPU", "LLVM"]) and Device.DEFAULT != "WEBGPU" and getenv("CUDACPU") != 1
if dtype == dtypes.bfloat16: return False # numpy doesn't support bf16, tested separately in TestBFloat16DType
if dtype == dtypes.float64: return Device.DEFAULT not in ["WEBGPU", "METAL"] and not OSX
if dtype in [dtypes.int8, dtypes.uint8]: return Device.DEFAULT not in ["WEBGPU"]
if dtype in [dtypes.int16, dtypes.uint16]: return Device.DEFAULT not in ["WEBGPU", "TORCH"]
if dtype == dtypes.uint32: return Device.DEFAULT not in ["TORCH"]
if dtype in [dtypes.int64, dtypes.uint64]: return Device.DEFAULT not in ["WEBGPU", "TORCH"]
if dtype == dtypes.bool:
# host-shareability is a requirement for storage buffers, but the 'bool' type is not host-shareable
if Device.DEFAULT == "WEBGPU": return False
# TODO remove triton from here once internal casting is fixed. CAST of fp32s between 0-1 is broken in triton
if getenv("TRITON") == 1: return False
return True
def get_available_cast_dtypes(dtype: DType) -> List[DType]: return [v for k, v in DTYPES_DICT.items() if v != dtype and is_dtype_supported(v) and not k.startswith("_")] # don't cast internal dtypes
def _test_to_np(a:Tensor, np_dtype, target):
if DEBUG >= 2: print(a)
na = a.numpy()
if DEBUG >= 2: print(na, na.dtype, a.lazydata.realized)
try:
assert na.dtype == np_dtype
np.testing.assert_allclose(na, target)
except AssertionError as e:
raise AssertionError(f"\ntensor {a.numpy()} does not match target {target} with np_dtype {np_dtype}") from e
def _assert_eq(tensor:Tensor, target_dtype:DType, target):
if DEBUG >= 2: print(tensor.numpy())
try:
assert tensor.dtype == target_dtype
np.testing.assert_allclose(tensor.numpy(), target)
except AssertionError as e:
raise AssertionError(f"\ntensor {tensor.numpy()} dtype {tensor.dtype} does not match target {target} with dtype {target_dtype}") from e
def _test_op(fxn, target_dtype:DType, target): _assert_eq(fxn(), target_dtype, target)
def _test_cast(a:Tensor, target_dtype:DType): _test_op(lambda: a.cast(target_dtype), target_dtype, a.numpy().astype(target_dtype.np).tolist())
def _test_bitcast(a:Tensor, target_dtype:DType, target): _test_op(lambda: a.bitcast(target_dtype), target_dtype, target)
class TestDType(unittest.TestCase):
DTYPE: Any = None
DATA: Any = None
@classmethod
def setUpClass(cls):
if not is_dtype_supported(cls.DTYPE): raise unittest.SkipTest("dtype not supported")
cls.DATA = np.random.randint(0, 100, size=10, dtype=cls.DTYPE.np).tolist() if dtypes.is_int(cls.DTYPE) else np.random.choice([True, False], size=10).tolist() if cls.DTYPE == dtypes.bool else np.random.uniform(0, 1, size=10).tolist()
def setUp(self):
if self.DTYPE is None: raise unittest.SkipTest("base class")
def test_to_np(self): _test_to_np(Tensor(self.DATA, dtype=self.DTYPE), self.DTYPE.np, np.array(self.DATA, dtype=self.DTYPE.np))
def test_casts_to(self): list(map(
lambda dtype: _test_cast(Tensor(self.DATA, dtype=dtype), self.DTYPE),
get_available_cast_dtypes(self.DTYPE)
))
def test_casts_from(self): list(map(
lambda dtype: _test_cast(Tensor(self.DATA, dtype=self.DTYPE), dtype),
get_available_cast_dtypes(self.DTYPE)
))
def test_upcast_ops(self): list(map(
lambda dtype: _test_ops(a_dtype=self.DTYPE, b_dtype=dtype, target_dtype=dtype) if dtype.sz > self.DTYPE.sz else None,
get_available_cast_dtypes(self.DTYPE)
))
def test_upcast_to_ops(self): list(map(
lambda dtype: _test_ops(a_dtype=dtype, b_dtype=self.DTYPE, target_dtype=self.DTYPE) if dtype.sz < self.DTYPE.sz else None,
get_available_cast_dtypes(self.DTYPE)
))
def _test_ops(a_dtype:DType, b_dtype:DType, target_dtype:DType):
if not is_dtype_supported(a_dtype) or not is_dtype_supported(b_dtype): raise unittest.SkipTest("dtype not supported")
_assert_eq(Tensor([1,2,3,4], dtype=a_dtype)+Tensor([1,2,3,4], dtype=b_dtype), target_dtype, [2,4,6,8])
_assert_eq(Tensor([1,2,3,4], dtype=a_dtype)*Tensor([1,2,3,4], dtype=b_dtype), target_dtype, [1,4,9,16])
_assert_eq(Tensor([[1,2],[3,4]], dtype=a_dtype)@Tensor.eye(2, dtype=b_dtype), target_dtype, [[1,2],[3,4]])
_assert_eq(Tensor([1,1,1,1], dtype=a_dtype)+Tensor.ones((4,4), dtype=b_dtype), target_dtype, 2*Tensor.ones(4,4).numpy())
class TestBFloat16DType(unittest.TestCase):
def setUp(self):
if not is_dtype_supported(dtypes.bfloat16): raise unittest.SkipTest("bfloat16 not supported")
def test_bf16_to_float(self):
with self.assertRaises(AssertionError):
_test_cast(Tensor([100000], dtype=dtypes.bfloat16), dtypes.float32, [100000])
def test_float_to_bf16(self):
with self.assertRaises(AssertionError):
_test_cast(Tensor([100000], dtype=dtypes.float32), dtypes.bfloat16, [100000])
# torch.tensor([10000, -1, -1000, -10000, 20]).type(torch.bfloat16)
def test_bf16(self):
t = Tensor([10000, -1, -1000, -10000, 20]).cast(dtypes.bfloat16)
t.realize()
back = t.cast(dtypes.float32)
assert tuple(back.numpy().tolist()) == (9984., -1, -1000, -9984, 20)
def test_bf16_disk_write_read(self):
t = Tensor([10000, -1, -1000, -10000, 20]).cast(dtypes.float32)
t.to(f"disk:{temp('f32')}").realize()
# hack to "cast" f32 -> bf16
dat = open(temp('f32'), "rb").read()
adat = b''.join([dat[i+2:i+4] for i in range(0, len(dat), 4)])
with open(temp('bf16'), "wb") as f: f.write(adat)
t = Tensor.empty(5, dtype=dtypes.bfloat16, device=f"disk:{temp('bf16')}").llvm().realize()
back = t.cast(dtypes.float32)
assert tuple(back.numpy().tolist()) == (9984., -1, -1000, -9984, 20)
class TestHalfDtype(TestDType): DTYPE = dtypes.half
class TestFloatDType(TestDType): DTYPE = dtypes.float
class TestDoubleDtype(TestDType): DTYPE = dtypes.double
class TestInt8Dtype(TestDType):
DTYPE = dtypes.int8
@unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
def test_int8_to_uint8_negative(self): _test_op(lambda: Tensor([-1, -2, -3, -4], dtype=dtypes.int8).cast(dtypes.uint8), dtypes.uint8, [255, 254, 253, 252])
class TestUint8Dtype(TestDType):
DTYPE = dtypes.uint8
@unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
def test_uint8_to_int8_overflow(self): _test_op(lambda: Tensor([255, 254, 253, 252], dtype=dtypes.uint8).cast(dtypes.int8), dtypes.int8, [-1, -2, -3, -4])
@unittest.skipIf(Device.DEFAULT not in {"CPU", "TORCH"}, "only bitcast in CPU and TORCH")
class TestBitCast(unittest.TestCase):
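# the magic constants below are IEEE-754 bit patterns reinterpreted as integers:
# float32 1.0 is 0x3F800000 == 1065353216, 2.0 is 0x40000000 == 1073741824, etc.
# (check with numpy: np.float32(1.0).view(np.int32) -> 1065353216)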
def test_float32_bitcast_to_int32(self): _test_bitcast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.int32, [1065353216, 1073741824, 1077936128, 1082130432])
@unittest.skipIf(Device.DEFAULT == "TORCH", "no uint32 in torch")
def test_float32_bitcast_to_uint32(self): _test_bitcast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.uint32, [1065353216, 1073741824, 1077936128, 1082130432])
def test_int32_bitcast_to_float32(self): _test_bitcast(Tensor([1065353216, 1073741824, 1077936128, 1082130432], dtype=dtypes.int32), dtypes.float32, [1.0, 2.0, 3.0, 4.0])
# NOTE: these are the same as normal casts
def test_int8_bitcast_to_uint8(self): _test_bitcast(Tensor([-1, -2, -3, -4], dtype=dtypes.int8), dtypes.uint8, [255, 254, 253, 252])
def test_uint8_bitcast_to_int8(self): _test_bitcast(Tensor([255, 254, 253, 252], dtype=dtypes.uint8), dtypes.int8, [-1, -2, -3, -4])
@unittest.skipIf(Device.DEFAULT == "TORCH", "no uint64 in torch")
def test_int64_bitcast_to_uint64(self): _test_bitcast(Tensor([-1, -2, -3, -4], dtype=dtypes.int64), dtypes.uint64, [18446744073709551615, 18446744073709551614, 18446744073709551613, 18446744073709551612])
@unittest.skipIf(Device.DEFAULT == "TORCH", "no uint64 in torch")
def test_uint64_bitcast_to_int64(self): _test_bitcast(Tensor([18446744073709551615, 18446744073709551614, 18446744073709551613, 18446744073709551612], dtype=dtypes.uint64), dtypes.int64, [-1, -2, -3, -4])
def test_shape_change_bitcast(self):
with self.assertRaises(AssertionError):
_test_bitcast(Tensor([100000], dtype=dtypes.float32), dtypes.uint8, [100000])
class TestInt16Dtype(TestDType): DTYPE = dtypes.int16
class TestUint16Dtype(TestDType): DTYPE = dtypes.uint16
class TestInt32Dtype(TestDType): DTYPE = dtypes.int32
class TestUint32Dtype(TestDType): DTYPE = dtypes.uint32
class TestInt64Dtype(TestDType): DTYPE = dtypes.int64
class TestUint64Dtype(TestDType): DTYPE = dtypes.uint64
class TestBoolDtype(TestDType): DTYPE = dtypes.bool
class TestEqStrDType(unittest.TestCase):
def test_image_ne(self):
assert dtypes.float == dtypes.float32, "float doesn't match?"
assert dtypes.imagef((1,2,4)) != dtypes.imageh((1,2,4)), "different image dtype doesn't match"
assert dtypes.imageh((1,2,4)) != dtypes.imageh((1,4,2)), "different shape doesn't match"
assert dtypes.imageh((1,2,4)) == dtypes.imageh((1,2,4)), "same shape matches"
assert isinstance(dtypes.imageh((1,2,4)), ImageDType)
def test_ptr_ne(self):
# TODO: is this the wrong behavior?
assert PtrDType(dtypes.float32) == dtypes.float32
#assert PtrDType(dtypes.float32) == PtrDType(dtypes.float32)
#assert PtrDType(dtypes.float32) != dtypes.float32
def test_strs(self):
self.assertEqual(str(dtypes.imagef((1,2,4))), "dtypes.imagef((1, 2, 4))")
self.assertEqual(str(PtrDType(dtypes.float32)), "ptr.dtypes.float")
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python
import gc
import unittest
import numpy as np
from tinygrad.tensor import Tensor
def tensors_allocated():
return sum([isinstance(x, Tensor) for x in gc.get_objects()])
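# NOTE: backward() allocates .grad Tensors for the leaves, which is why the counts
# in test_gc_complex go from 2 to 4 after a backward pass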
class TestGC(unittest.TestCase):
def test_gc(self):
a = Tensor.zeros(4, 4, requires_grad=True)
b = Tensor.zeros(4, 4, requires_grad=True)
(a*b).mean().backward()
assert(tensors_allocated() > 0)
del a,b
assert(tensors_allocated() == 0)
def test_gc_complex(self):
a = Tensor(np.zeros((4, 4), dtype=np.float32), requires_grad=True)
b = Tensor(np.zeros((4, 4), dtype=np.float32), requires_grad=True)
assert(tensors_allocated() == 2)
(a*b).mean().backward()
assert(tensors_allocated() == 4)
del b
assert(tensors_allocated() == 2)
b = Tensor(np.zeros((4, 4), dtype=np.float32), requires_grad=True)
print(tensors_allocated())
(a*b).mean().backward()
print(tensors_allocated())
assert(tensors_allocated() == 4)
del b
assert(tensors_allocated() == 2)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
import pytest
pytestmark = pytest.mark.webgpu
# NOTE: METAL fails; it might be platform- and optimization-option dependent.
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["METAL", "WEBGPU"], f"no JIT on {Device.DEFAULT}")
class TestJit(unittest.TestCase):
def test_simple_jit(self):
@TinyJit
def add(a, b): return (a+b).realize()
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c = add(a, b)
np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
assert len(add.jit_cache) == 1
def test_jit_multiple_outputs(self):
@TinyJit
def f(a, b): return (a+b).realize(), (a-b).realize(), (a*b).realize()
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c, d, e = f(a, b)
np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
np.testing.assert_allclose(d.numpy(), a.numpy()-b.numpy(), atol=1e-4, rtol=1e-5)
np.testing.assert_allclose(e.numpy(), a.numpy()*b.numpy(), atol=1e-4, rtol=1e-5)
assert len(f.jit_cache) == 3
def test_nothing_jitted(self):
@TinyJit
def add(a, b): return a+b
with self.assertRaises(AssertionError):
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c = add(a, b)
def test_jit_shape_mismatch(self):
@TinyJit
def add(a, b): return (a+b).realize()
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c = add(a, b)
bad = Tensor.randn(20, 20)
with self.assertRaises(AssertionError):
add(a, bad)
def test_jit_shape_views_mismatch(self):
@TinyJit
def add(a): return (a+1).realize()
with self.assertRaises(AssertionError):
for i in range(1,5):
# a has an offset that the kernel doesn't know about
a = Tensor.randn(10, 10).realize()[:, i:i+2]
add(a)
def test_jit_duplicate_fail(self):
# the jit doesn't support duplicate arguments
@TinyJit
def add(a, b): return (a+b).realize()
a = Tensor.randn(10, 10)
with self.assertRaises(AssertionError):
add(a, a)
def test_kwargs_jit(self):
@TinyJit
def add_kwargs(first, second): return (first+second).realize()
for _ in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
c = add_kwargs(first=a, second=b)
np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
assert len(add_kwargs.jit_cache) == 1
def test_array_jit(self):
@TinyJit
def add_array(a, arr): return (a+arr[0]).realize()
for i in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
a.realize(), b.realize()
c = add_array(a, [b])
if i >= 2:
# should fail once jitted since jit can't handle arrays
np.testing.assert_allclose(np.any(np.not_equal(c.numpy(),a.numpy()+b.numpy())), True, atol=1e-4, rtol=1e-5)
else:
np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
assert len(add_array.jit_cache) == 1
def test_method_jit(self):
class Fun:
def __init__(self):
self.a = Tensor.randn(10, 10)
@TinyJit
def __call__(self, b:Tensor) -> Tensor:
return (self.a+b).realize()
fun = Fun()
for _ in range(5):
b = Tensor.randn(10, 10)
c = fun(b)
np.testing.assert_allclose(c.numpy(), fun.a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
assert len(fun.__call__.func.__self__.jit_cache) == 1
def test_jit_size1_input(self):
@TinyJit
def f(a, b): return (a+b).realize()
a = Tensor([1, 2, 3])
for i in range(5):
np.testing.assert_allclose(f(a, Tensor([i])).numpy(), (a+i).numpy(), atol=1e-4, rtol=1e-5)
assert len(f.jit_cache) == 1
def test_jit_output_non_tensor_fail(self):
@TinyJit
def f(a, b, i): return (a+b).realize(), i
output1, output2 = [], []
expect1, expect2 = [], []
for i in range(5):
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
o1, o2 = f(a, b, i)
output1.append(o1.numpy().copy())
output2.append(o2)
expect1.append(a.numpy().copy()+b.numpy().copy())
expect2.append(i)
np.testing.assert_allclose(output1, expect1, atol=1e-4, rtol=1e-5)
# the jit only works with Tensor outputs
assert output2 != expect2
assert len(f.jit_cache) == 1
@unittest.skip("random isn't working in JIT")
def test_jit_random_regen(self):
def f(a, b):
rn = Tensor.randn(*a.shape)
return ((a+b)*rn).realize()
a = Tensor.randn(10, 10)
b = Tensor.randn(10, 10)
Tensor._seed = 1234
jf = TinyJit(f)
res = set()
for _ in range(5):
o1 = jf(a, b)
res.add(o1.numpy()[0][0])
assert len(res) == 5, "All values should be different, rand works in jit."
Tensor._seed = 1234
jf2 = TinyJit(f)
res2 = set()
for _ in range(5):
o1 = jf2(a, b)
res2.add(o1.numpy()[0][0])
assert len(res2) == 5, "All values should be different, rand works in jit."
assert res == res2, "Jit rand is not reproducible with the same seed"
Tensor._seed = 3421
jf3 = TinyJit(f)
res3 = set()
for _ in range(5):
o1 = jf3(a, b)
res3.add(o1.numpy()[0][0])
assert len(res3) == 5, "All values should be different, rand works in jit."
assert res3 != res2, "jit rand should produce different values with a different seed"
def test_jit_realization_and_sampling(self):
w = Tensor.eye(5)
@TinyJit
def foo (x): return w.dot(x).realize()
arg = [
Tensor([1,2,3,4,5]),
Tensor([1,3,3,4,6]),
Tensor([1,2,5,4,7]),
Tensor([0,2,3,1,0]),
]
Y = [foo(e).numpy() for e in arg]
foo(Tensor([7,7,7,7,7]))
want = [[1., 2., 3., 4., 5.],
[1., 3., 3., 4., 6.],
[1., 2., 5., 4., 7.],
[0., 2., 3., 1., 0.]]
np.testing.assert_allclose(want, Y)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python
import unittest
import secrets
import string
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.helpers import diskcache
def generate_random_string(length=16):
alphabet = string.ascii_letters + string.digits
return ''.join(secrets.choice(alphabet) for _ in range(length))
compile_call_count = 0
@diskcache
def helper_test_compile(prg:str) -> bytes:
global compile_call_count
compile_call_count += 1
return prg.encode()
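# @diskcache memoizes helper_test_compile on disk keyed by its argument, so a repeated
# call with the same prg must return the cached bytes without bumping compile_call_count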
class TestKernelCache(unittest.TestCase):
def test_compile_cache(self):
prg1 = generate_random_string(64) + "a"
prg2 = generate_random_string(64) + "b"
cold_compile_res = helper_test_compile(prg1)
warm_compile_res = helper_test_compile(prg1)
assert cold_compile_res == warm_compile_res == prg1.encode()
assert compile_call_count == 1
prg2_res = helper_test_compile(prg2)
assert prg2_res == prg2.encode()
assert compile_call_count == 2
def test_kernel_cache_in_action(self):
if Device.DEFAULT not in ["CLANG"]:
self.skipTest("No custom kernel cache is implemented")
a = Tensor.rand(4,4)
b = Tensor.rand(4,4)
x = a + b
x.realize()
orig_compile_func = Device['CLANG'].compiler
Device['CLANG'].compiler = None # making it not callable
a1 = Tensor.rand(4,4)
b1 = Tensor.rand(4,4)
x1 = a1 + b1
x1.realize() # the same kernel should come from the cache
Device['CLANG'].compiler = orig_compile_func
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,73 @@
#!/usr/bin/env python
import numpy as np
import unittest
from tinygrad.lazy import LazyBuffer
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.shape.symbolic import Variable
from tinygrad.jit import CacheCollector
class TestLazyBuffer(unittest.TestCase):
def test_fromcpu_buffer_sharing(self):
a = np.arange(8)
assert LazyBuffer.fromCPU(a).realized._buf is a
def test_fromcpu_shape_tracker(self):
def helper(a: np.ndarray):
print(a.shape, a.strides, a.flags.c_contiguous)
b = LazyBuffer.fromCPU(a)
#assert b.st.contiguous == a.flags.c_contiguous
assert b.st.shape == a.shape
np.testing.assert_equal(a, Tensor(b).numpy())
for ndims in range(1, 4):
a = np.random.randn(*(4,)*ndims).astype(np.float32)
for stride in [-2, 1, 2]:
for start in [0, 1]:
helper(a[(slice(start, None, stride),)*ndims])
def test_shuffle_pad_ops_cmpeq(self):
y = Tensor([1]).cat(Tensor([1]) == 0).numpy()
z = Tensor([1, 0]).numpy()
np.testing.assert_allclose(y, z)
def test_shuffle_pad_ops_div(self):
y = Tensor([1]).cat(Tensor([1]).div(Tensor([2.0]))).numpy()
z = Tensor([1, 0.5]).numpy()
np.testing.assert_allclose(y, z)
def test_shuffle_pad_ops_log(self):
y = Tensor([1]).cat(Tensor([1]).log()).numpy()
z = Tensor([1, 0]).numpy()
np.testing.assert_allclose(y, z)
def test_shuffle_pad_ops_exp(self):
y = Tensor([1]).cat(Tensor([1]).exp()).numpy()
z = Tensor([1, np.e]).numpy()
np.testing.assert_allclose(y, z)
@unittest.skipUnless(Device.DEFAULT in ["METAL", "CUDA", "GPU"], "Only GPU backends supports cache")
def test_children_count(self):
a = Tensor.ones(8,8,8)
d1 = a.sum((0))
d2 = a.sum((0)).reshape(32,2)
assert len(d1.lazydata.op.src[0].children) == 1
in1 = d1.reshape(16,4)
d3 = in1.reshape(8,8)
assert len(d3.lazydata.op.src[0].children) == 2
CacheCollector.start()
l = Tensor.ones(8,8)
r = Tensor.ones(8,8)
dd = d1 + l
dd.realize()
de = d3 + r
de.realize()
cache = CacheCollector.finish()
assert len(cache) == 3
assert cache[0][0].name.startswith("r_") # the reduce should not be merged twice
assert cache[1][0].name.startswith("E_")
assert cache[2][0].name.startswith("E_")
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,21 @@
import unittest
from tinygrad.tensor import Tensor
# stuff needed to unpack a kernel
from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
from tinygrad.helpers import dtypes
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.shape.symbolic import Variable
inf, nan = float('inf'), float('nan')
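# inf and nan are bound above so that eval(str(ast)) round-trips even when the repr
# contains bare inf/nan literals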
class TestLazyOp(unittest.TestCase):
def test_lazyop_str(self):
t = Tensor.rand(10) + Tensor.rand(10)
s = t.lazydata.schedule()
ast = s[-1].ast
ast_remade = eval(str(ast))
self.assertEqual(ast, ast_remade)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,492 @@
import numpy as np
import unittest, os
from tinygrad.codegen.kernel import Opt, OptOps, tensor_cores
from tinygrad.codegen.linearizer import Linearizer, UOps
from tinygrad.ops import Compiled, Device, LoadOps
from tinygrad.tensor import Tensor
from tinygrad.jit import CacheCollector
from tinygrad.realize import run_schedule
from tinygrad.helpers import dtypes, prod
class TestLinearizer(unittest.TestCase):
def test_arg_dedup(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled supports cache")
a, b = Tensor.randn(4), Tensor.randn(4)
np_a, np_b = a.numpy(), b.numpy()
CacheCollector.start()
c = ((a.shrink(((0, 2),)) - a.shrink(((2, 4),))) - (b.shrink(((0, 2),)) - b.shrink(((2, 4),)))).realize()
rawbufs = CacheCollector.finish()[0][1]
assert len(rawbufs) == 3 and set(rawbufs[1:]) == {a.lazydata.realized, b.lazydata.realized}
np_c = (np_a[:2] - np_a[2:]) - (np_b[:2] - np_b[2:])
np.testing.assert_allclose(np_c, c.numpy(), atol=1e-4, rtol=1e-4)
def test_load_dedup(self):
# for different leaves in the AST, the same loads may occur.
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
a = Tensor.randn(4).realize()
# these are of size 3 to avoid float4 coalesce
r = a[:-1] + a[1:]
k = Linearizer(r.lazydata.schedule()[-1].ast)
k.upcast()
k.linearize()
num_loads = len([uop for uop in k.uops if uop.uop == UOps.LOAD])
assert num_loads <= 4, "more load uops than needed"
assert num_loads >= 4, "unexpected number of uops, maybe this test needs updating?"
def test_upcast_cse(self):
# when upcasting, within a subtree, there may be common expressions.
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
a, b = Tensor.randn(1).realize(), Tensor.randn(1).realize()
r = a.expand([2]) + b.expand([2])
k = Linearizer(r.lazydata.schedule()[-1].ast)
k.upcast()
k.linearize()
num_ops = len([uop for uop in k.uops if uop.uop == UOps.ALU])
assert num_ops <= 1, "more alu uops than needed"
def test_zero_fold(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
a, b = Tensor.randn(1).realize(), Tensor.randn(1).realize()
r = Tensor.stack([a, b])
k = Linearizer(r.lazydata.schedule()[-1].ast)
k.upcast()
k.linearize()
num_ops = len([uop for uop in k.uops if uop.uop == UOps.ALU])
assert num_ops == 0, "more alu uops than needed"
@unittest.skip("constant folding not supported yet")
def test_constant_fold(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
a, b = Tensor(2), Tensor(3)
r = a * b
k = Linearizer(r.lazydata.schedule()[-1][0])
k.linearize()
num_ops = len([uop for uop in k.uops if uop.uop in [UOps.LOAD, UOps.ALU]])
assert num_ops <= 0, "more load or alu uops than needed"
def test_tensor_cores(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
if Device.DEFAULT not in tensor_cores:
self.skipTest("No tensor cores for device")
for tc in tensor_cores[Device.DEFAULT]:
if tc.arch is not None and tc.arch != os.uname().machine: continue
a, b = Tensor.rand(tc.dims[0], tc.dims[2], dtype=tc.dtype_in), Tensor.rand(tc.dims[2], tc.dims[1], dtype=tc.dtype_in)
np_a, np_b = a.numpy(), b.numpy()
if tc.dtype_out != tc.dtype_in:
r = (a.reshape(tc.dims[0], 1, tc.dims[2]) * b.permute(1,0).reshape(1, tc.dims[1], tc.dims[2])).cast(tc.dtype_out).sum(axis=2)
else:
r = a @ b
realized_ast, _ = helper_realized_ast(r)
k = Linearizer(realized_ast)
k.apply_tensor_cores(1)
k.linearize()
assert len([uop for uop in k.uops if uop.uop == UOps.WMMA]) == 1, "tensor core not triggered"
np_c = np_a @ np_b
np.testing.assert_allclose(np_c, r.numpy(), atol=5e-3, rtol=1e-4)
def test_limit_dims_to_max_5d_global(self):
t = Tensor.rand(3, 4, 5, 6, 7).pad(((1, 1), (1, 1), (1, 1), (1, 1), (1, 1))) + 1
sched = [si for si in t.lazydata.schedule() if si.ast.op not in LoadOps]
assert len(sched) == 1
lin = Linearizer(sched[0].ast)
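# the input shape (3, 4, 5, 6, 7) padded by 1 on each side gives (5, 6, 7, 8, 9)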
assert lin.full_shape[:lin.global_dims] == (5, 6, 7, 8, 9)
lin.limit_dims_to_max(global_max=[16, 16, 16], local_max=[16, 16, 16])
def helper_realized_ast(r:Tensor):
s = r.lazydata.schedule()
run_schedule(s[:-1]) # run all kernels except the last one
# now all input LazyBuffers buffers in s[-1] should be realized
output_buffer = Device[s[-1].out.device].buffer(prod((sh if isinstance(sh, int) else sh.max for sh in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer (symbolic dims use their max size)
return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]
class TestFloat4(unittest.TestCase):
def setUp(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.supports_float4:
self.skipTest("Device does not support float4")
@staticmethod
def count_float4(k):
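# returns (number of float4 LOAD uops, number of float4 STORE uops) in the linearized kernel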
return (len([uop for uop in k.uops if uop.uop == UOps.LOAD and uop.dtype == dtypes._float4]),
len([uop for uop in k.uops if uop.uop == UOps.STORE and len(uop.vin) == 3 and uop.vin[2].dtype == dtypes._float4]))
# TODO: express opts below as auto opts
def test_float4_basic(self):
a = Tensor.rand(2, 8).realize()
b = Tensor.rand(2, 8).realize()
c = a + b
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.hand_coded_optimizations()
k.linearize()
assert TestFloat4.count_float4(k) == (2, 1)
def test_float4_multidim(self):
a = Tensor.rand(2, 8).realize()
b = Tensor.rand(2, 8).realize()
c = a + b
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(0, 4) # float4 dimension
k.shift_to(0, 2, insert_before=k.shape_len-1)
k.upcast()
k.upcast()
k.local_dims += 1
k.linearize()
assert TestFloat4.count_float4(k) == (4, 2)
def test_float4_unaligned_load(self):
a = Tensor.rand(9).realize().shrink(((1, 9),))
b = Tensor.rand(9).realize().shrink(((1, 9),))
c = a + b
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.hand_coded_optimizations() # implicit trigger float4 dim
k.linearize()
assert TestFloat4.count_float4(k) == (0, 1)
def test_float4_multidim_unaligned_load(self):
a = Tensor.rand(2, 9).realize().shrink(((0, 2), (1, 9),))
b = Tensor.rand(2, 9).realize().shrink(((0, 2), (1, 9),))
c = a + b
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(len(k.full_unupcasted_shape)-1, 4) # manual trigger float4 dim
k.upcast()
k.shift_to(len(k.full_unupcasted_shape)-1, 2, insert_before=k.shape_len-1)
k.upcast()
k.local_dims += 1
k.linearize()
assert TestFloat4.count_float4(k) == (0, 2)
def test_float4_sometimes_unaligned(self):
a = Tensor.rand(1, 1, 8).realize()
b = Tensor.rand(1, 1, 5).realize().shrink(((0, 1), (0, 1), (1, 5)))
c = a.conv2d(b)
# only the first and last conv dot products are aligned in a, and b is never aligned, so no
# float4 should be emitted (the reduce axis of size 4 is the float4 axis here)
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (0, 0)
def test_float4_multidim_sometimes_unaligned(self):
a = Tensor.rand(1, 1, 7).realize()
b = Tensor.rand(1, 1, 5).realize().shrink(((0, 1), (0, 1), (1, 5)))
c = a.conv2d(b)
# the first conv dot product is aligned in a. If we upcast the output and reduce
# dimension, then we could do float4 for only that one set of loads, but we currently
# don't.
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.upcast()
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (0, 1)
def test_float4_noncontiguous(self):
a = Tensor.rand(4, 2).realize()
b = Tensor.rand(4, 2).realize()
c = a + b
# we will upcast the top axis of sz 4. they should not be coalesced into float4,
# since the top axis is not contiguous.
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(0, 4, top=True) # top axes are float4 axes
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (0, 0)
def test_float4_expand(self):
a = Tensor.rand(9).realize().shrink(((1, 9),))
b = Tensor.rand(2).realize().reshape((2, 1)).expand((2,4)).reshape((8,))
c = a + b
# we upcast an axis of size 4. the loads should not be coalesced into float4:
# a is unaligned from the shrink, and b's elements repeat due to the expand
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(0, 4) # float4 axis
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (0, 1)
def test_float4_heterogeneous(self):
a = Tensor.rand(8).realize()
b = Tensor.rand(9).realize().shrink(((1, 9),))
c = a + b
# should float4 b but not a
s = c.lazydata.schedule()[0]
k = Linearizer(s.ast)
k.shift_to(0, 4) # float4 axis
k.upcast()
k.linearize()
assert TestFloat4.count_float4(k) == (1, 1)
class TestHandCodedOpts(unittest.TestCase):
def setUp(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Device does not use linearizer")
def test_masked_upcast(self):
layer_1 = Tensor.cat(*[Tensor.rand(5) for _ in range(4)])
layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.rand(6, 20))
s = layer_2.lazydata.schedule()[-1]
k = Linearizer(s.ast)
k.hand_coded_optimizations()
assert len(k.bufs) == 6 # make sure all ops are done in one kernel
# masked upcast should upcast masked axis of size 7
# masked upcast should not upcast large (20) last axis
# float4/other hcopt shouldn't upcast the last axis, since we already have the size-7 upcast, and the last axis is not very contiguous
assert k.upcasted == 1 and k.full_shape[-1] == 7
def test_masked_upcast_wino(self):
monster = Tensor.stack([Tensor.stack([Tensor.rand(16) for _ in range(6)]) for _ in range(6)])
s = monster.lazydata.schedule()[-1]
k = Linearizer(s.ast)
k.hand_coded_optimizations()
assert len(k.bufs) == 37 # make sure all ops are done in one kernel
# should upcast the two Tensor.stacks
assert k.upcasted >= 2 and k.full_shape[k.shape_len-k.upcasted:k.shape_len].count(6) == 2
def test_masked_upcast_wino_full(self):
old_wino = Tensor.wino
Tensor.wino = True
x,w = Tensor.rand(1,4,9,9, requires_grad=True).realize(), Tensor.rand(4,4,3,3, requires_grad=True).realize()
out = Tensor.conv2d(x,w, padding=1)
upcasts = []
# collect upcasts of tile transform kernels
for i, si in enumerate(out.lazydata.schedule()):
k = Linearizer(si.ast)
k.hand_coded_optimizations()
if k.reduceop is not None: continue # not a tile transform kernel (there is a gemm reduce kernel)
if len(k.bufs) < 100: continue # not a tile transform kernel (there's a permute kernel at the end)
upcasts.append(tuple(k.full_shape[k.shape_len - k.upcasted:k.shape_len]))
assert len(upcasts) == 3 # 3 transformation matrices
assert upcasts.count((6, 6)) == 2 and upcasts.count((4, 4)) == 1
out.mean().backward()
for si in x.grad.lazydata.schedule() + w.grad.lazydata.schedule():
k = Linearizer(si.ast)
k.hand_coded_optimizations()
k.linearize()
if len(k.bufs) < 20: continue # not a tile transform kernel
# heuristic number to make sure that at least some upcasts but not too many upcasts are being done
assert 6 <= prod(k.full_shape[k.shape_len - k.upcasted:k.shape_len]) <= 49
Tensor.wino = old_wino
def test_masked_upcast_many(self):
layer_1 = Tensor.cat(Tensor.rand(3, 4), Tensor.rand(4, 4))
layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.rand(6, 7, 4))
layer_3 = Tensor.cat(layer_2.unsqueeze(0), Tensor.rand(6, 7, 7, 4))
s = layer_3.lazydata.schedule()[-1]
k = Linearizer(s.ast)
k.hand_coded_optimizations()
assert len(k.bufs) == 5 # make sure all ops are done in one kernel
# check that we don't do too many upcasts
assert prod(k.full_shape[k.shape_len-k.upcasted:k.shape_len]) <= 49
def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
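# runs the kernel unoptimized as a baseline, then re-runs it with hand-coded and any custom opts, checking every variant produces the same output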
wanna_output = None
realized_ast, real_bufs = helper_realized_ast(r)
def check_opt(opts, create_k, to_prg):
k = create_k()
if apply_tc:
k.apply_tensor_cores(1, opts)
else:
for opt in opts:
k.apply_opt(opt)
prg = to_prg(k)
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
prg.exec(real_bufs, force_wait=True)
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
# Get baseline, which is not optimized at all.
k = Linearizer(realized_ast)
prg = Device[Device.DEFAULT].to_program(k)
prg.exec(real_bufs, force_wait=True)
wanna_output = real_bufs[0].toCPU().copy()
# Check correctness of hand-coded optimizations.
k = Linearizer(realized_ast)
k.hand_coded_optimizations()
prg = Device[Device.DEFAULT].to_program(k)
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
prg.exec(real_bufs, force_wait=True)
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
for x in opts: # Check custom transformations if any.
check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_program)
class TestLinearizerOpts(unittest.TestCase):
def test_local_and_grouped_reduce(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
self.skipTest("Only Compiled uses linearizer with locals and shared")
N = 128
Tensor.manual_seed(1882)
a = Tensor.rand(4, 4, N, N)
b = Tensor.rand(4, 4, N)
r = (b.sqrt() + ((a+1).sum(axis=3).exp()))
helper_linearizer_opt(r, [
[Opt(OptOps.LOCAL, 0, 2)],
[Opt(OptOps.LOCAL, 0, 8)],
[Opt(OptOps.LOCAL, 0, 16)], # Checking how it works with locals
[Opt(OptOps.GROUPTOP, 0, 2)],
[Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.GROUPTOP, 0, 64)], # Checking how it works with grouped reduce
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2)],
[Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.GROUPTOP, 0, 16)],
[Opt(OptOps.LOCAL, 0, 32), Opt(OptOps.GROUPTOP, 0, 2)],
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 64)], # Checking how it works with locals + grouped reduce
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.UNROLL, 1, 4)], # Checking how it works with locals + grouped reduce + upcasts
])
def test_upcasts(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
N = 16
Tensor.manual_seed(1772)
a = Tensor.rand(N, N)
b = Tensor.rand(N, N)
r = (a+b).sqrt() * ((a+1).exp())
helper_linearizer_opt(r, [
[Opt(OptOps.UPCAST, 0, 2)],
[Opt(OptOps.UPCAST, 0, 4)],
[Opt(OptOps.UPCAST, 0, 8)], # Checking how it works with upcasts
])
def test_full_upcast(self):
if not isinstance(Device[Device.DEFAULT], Compiled):
self.skipTest("Only Compiled uses linearizer")
Tensor.manual_seed(1772)
a = Tensor.rand(4)
b = Tensor.rand(4)
r = (a+b).sqrt() * ((a+1).exp())
helper_linearizer_opt(r, [
[Opt(OptOps.UPCAST, 0, 4)], # Checking how it works with upcasts
])
def test_matmul(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
self.skipTest("Only Compiled uses linearizer with locals and shared")
N = 128
Tensor.manual_seed(1552)
a = Tensor.rand(N, N)
b = Tensor.rand(N, N)
r = a@b
helper_linearizer_opt(r, [
[Opt(OptOps.UPCAST, 0, 2)],
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)], # Checking how it works with upcasts
[Opt(OptOps.LOCAL, 0, 2)],
[Opt(OptOps.LOCAL, 1, 32)],
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4)],
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 32)],
[Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.LOCAL, 1, 8)], # Checking how it works with locals
[Opt(OptOps.GROUPTOP, 0, 2)],
[Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.GROUPTOP, 0, 32), Opt(OptOps.UNROLL, 0, 4)], # Checking how it works with grouped_reduce
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 4)], # Checking how it works with local+grouped_reduce
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 2)], # Checking all together
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 8)], # Full global upcast + local
])
def test_double_reduce(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
self.skipTest("Only Compiled uses linearizer with locals and shared")
N = 128
Tensor.manual_seed(1552)
a = Tensor.rand(8, N, 8, N)
r = a.sum(axis=(1,3))
helper_linearizer_opt(r, [
# OpenCL / GPU=1 allows at most 256 threads
[Opt(OptOps.GROUPTOP, 0, 2)], [Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.GROUPTOP, 1, 2)], [Opt(OptOps.GROUPTOP, 1, 32)], # Checking how it works with 1 grouped_reduce.
[Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)],
[Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2)],
[Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 64)], # Checking how it works with 2 grouped_reduces.
[Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 0, 4)],
[Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 2, 4)], # Checking how it works with 2 grouped_reduces + upcasts.
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4)],
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 1, 4)], # Checking how it works with 2 grouped_reduces + upcasts + locals.
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2)],
[Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4)], # Checking how it works with 2 grouped_reduces + upcasts + locals.
[Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 0, 2)], # No globals
])
def test_tensor_core_opts(self):
if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local:
self.skipTest("Only Compiled uses linearizer with locals")
if Device.DEFAULT not in tensor_cores:
self.skipTest("No tensor cores for device")
N = 128
Tensor.manual_seed(1552)
a = Tensor.rand(N, N)
b = Tensor.rand(N, N)
r = a@b
helper_linearizer_opt(r, [
[Opt(OptOps.UPCAST, 0, 4)],
[Opt(OptOps.UPCAST, 1, 4)],
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)], # check upcasts
[Opt(OptOps.UNROLL, 0, 2)], # check last unroll
[Opt(OptOps.LASTLOCAL, 0, 4)], # check last local
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 2)], # check combo of last unroll and last local
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 2)],
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4)],
[Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.LASTLOCAL, 0, 2)],
# [Opt(OptOps.GROUP, 0, 2)] # doesn't work because group_for_reduce dims become early locals (conflicting with TC)
], apply_tc=True)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,21 @@
import unittest
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.ops import Device
# stuff needed to unpack a kernel
from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
from tinygrad.helpers import dtypes
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.shape.symbolic import Variable
inf, nan = float('inf'), float('nan')
class TestLinearizerFailures(unittest.TestCase):
@unittest.skip("this is currently failing")
def test_failure_1(self):
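# serialized AST, presumably captured from the failing kernel; building the Linearizer from it directly reproduces the failure without constructing a Tensor graph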
ast = LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=BufferOps.MEM, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(32, 16, 16), strides=(16, 1, 0), offset=0, mask=None, contiguous=False),)))),), arg=(32, 16, 1)), LazyOp(op=BufferOps.MEM, src=(), arg=MemBuffer(idx=2, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),))))), arg=None), LazyOp(op=BufferOps.MEM, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(16, 1, 0), offset=0, mask=None, contiguous=True),))))), arg=None)
lin = Linearizer(ast)
prg = Device[Device.DEFAULT].to_program(lin)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python
import time
import cProfile
import pstats
import unittest
import torch
from tinygrad.tensor import Tensor, Device
import pytest
pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
def start_profile():
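# cProfile with a wall-clock timer using nanosecond ticks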
import time
pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)
pr.enable()
return pr
def stop_profile(pr, sort='cumtime', frac=0.2):
pr.disable()
ps = pstats.Stats(pr)
ps.strip_dirs()
ps.sort_stats(sort)
ps.print_stats(frac)
class TestConvSpeed(unittest.TestCase):
def test_mnist(self):
# https://keras.io/examples/vision/mnist_convnet/
conv = 3
inter_chan, out_chan = 32, 64
# ****** torch baseline *******
torch.backends.mkldnn.enabled = False
c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
c2d = torch.nn.functional.conv2d
mp = torch.nn.MaxPool2d((2,2))
lsm = torch.nn.LogSoftmax(dim=1)
cnt = 5
fpt, bpt = 0.0, 0.0
for i in range(cnt):
et0 = time.time()
x = torch.randn(128, 1, 28, 28, requires_grad=True)
x = mp(c2d(x,c1).relu())
x = mp(c2d(x,c2).relu())
x = x.reshape(x.shape[0], -1)
out = lsm(x.matmul(l1))
out = out.mean()
et1 = time.time()
out.backward()
et2 = time.time()
fpt += (et1-et0)
bpt += (et2-et1)
fpt_baseline = (fpt*1000/cnt)
bpt_baseline = (bpt*1000/cnt)
print("torch forward pass: %.3f ms" % fpt_baseline)
print("torch backward pass: %.3f ms" % bpt_baseline)
# ****** tinygrad compare *******
c1 = Tensor(c1.detach().numpy(), requires_grad=True)
c2 = Tensor(c2.detach().numpy(), requires_grad=True)
l1 = Tensor(l1.detach().numpy(), requires_grad=True)
cnt = 5
fpt, bpt = 0.0, 0.0
for i in range(1+cnt):
et0 = time.time()
x = Tensor.randn(128, 1, 28, 28)
x = x.conv2d(c1).relu().avg_pool2d()
x = x.conv2d(c2).relu().max_pool2d()
x = x.reshape(shape=(x.shape[0], -1))
out = x.dot(l1).log_softmax()
out = out.mean()
out.realize()
et1 = time.time()
out.backward()
[x.grad.realize() for x in [c1, c2, l1]]
et2 = time.time()
if i == 0:
pr = start_profile()
else:
fpt += (et1-et0)
bpt += (et2-et1)
stop_profile(pr, sort='time')
fpt = (fpt*1000/cnt)
bpt = (bpt*1000/cnt)
print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,339 @@
#!/usr/bin/env python
import unittest
import numpy as np
from extra.utils import WINDOWS
from tinygrad.helpers import CI
from tinygrad.jit import TinyJit
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import BatchNorm2d, Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, GroupNorm, LayerNorm, LayerNorm2d, Embedding, InstanceNorm
import torch
import pytest
pytestmark = [pytest.mark.exclude_cuda]
class TestNN(unittest.TestCase):
def test_sparse_cat_cross_entropy(self):
input = torch.randn(3, 5)
target = torch.empty(3, dtype=torch.long).random_(5)
loss_fun = torch.nn.CrossEntropyLoss(reduction='mean')
loss = loss_fun(input, target)
input_tiny = Tensor(input.detach().numpy())
target_tiny = Tensor(target.detach().numpy())
loss_tiny = input_tiny.sparse_categorical_crossentropy(target_tiny)
np.testing.assert_allclose(loss_tiny.numpy(), loss.detach().numpy(), atol=1e-5, rtol=1e-6)
def test_batchnorm2d(self, training=False):
szs = [4, 8, 16, 32]
for sz in szs:
# create in tinygrad
Tensor.training = training
bn = BatchNorm2d(sz, eps=1e-5, track_running_stats=training)
bn.weight = Tensor.randn(sz)
bn.bias = Tensor.randn(sz)
bn.running_mean = Tensor.randn(sz)
bn.running_var = Tensor.randn(sz)
bn.running_var.numpy()[bn.running_var.numpy() < 0] = 0
# create in torch
with torch.no_grad():
tbn = torch.nn.BatchNorm2d(sz).eval()
tbn.training = training
tbn.weight[:] = torch.tensor(bn.weight.numpy())
tbn.bias[:] = torch.tensor(bn.bias.numpy())
tbn.running_mean[:] = torch.tensor(bn.running_mean.numpy())
tbn.running_var[:] = torch.tensor(bn.running_var.numpy())
np.testing.assert_allclose(bn.running_mean.numpy(), tbn.running_mean.detach().numpy(), rtol=1e-5, atol=1e-6)
np.testing.assert_allclose(bn.running_var.numpy(), tbn.running_var.detach().numpy(), rtol=1e-5, atol=1e-6)
# trial
inn = Tensor.randn(2, sz, 3, 3)
# in tinygrad
outt = bn(inn)
# in torch
toutt = tbn(torch.tensor(inn.numpy()))
# close
np.testing.assert_allclose(outt.numpy(), toutt.detach().numpy(), rtol=5e-4, atol=1e-6)
np.testing.assert_allclose(bn.running_mean.numpy(), tbn.running_mean.detach().numpy(), rtol=1e-5, atol=1e-6)
np.testing.assert_allclose(bn.running_var.numpy(), tbn.running_var.detach().numpy(), rtol=1e-5, atol=1e-6)
def test_batchnorm2d_training(self):
self.test_batchnorm2d(True)
def test_linear(self):
def _test_linear(x):
# create in tinygrad
model = Linear(in_dim, out_dim)
z = model(x)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.Linear(in_dim, out_dim).eval()
torch_layer.weight[:] = torch.tensor(model.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(model.bias.numpy(), dtype=torch.float32)
torch_x = torch.tensor(x.numpy(), dtype=torch.float32)
torch_z = torch_layer(torch_x)
# test
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
BS, T, in_dim, out_dim = 4, 2, 8, 16
_test_linear(Tensor.randn(BS, in_dim))
_test_linear(Tensor.randn(BS, T, in_dim)) # test with more dims
def test_conv1d(self):
BS, C1, W = 4, 16, 224//4
C2, K, S, P = 64, 7, 2, 1
# create in tinygrad
layer = Conv1d(C1, C2, kernel_size=K, stride=S, padding=P)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.Conv1d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.uniform(BS, C1, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
def test_conv2d(self):
BS, C1, H, W = 4, 16, 224//4, 224//4
C2, K, S, P = 64, 7, 2, 1
# create in tinygrad
layer = Conv2d(C1, C2, kernel_size=K, stride=S, padding=P)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.Conv2d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.uniform(BS, C1, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
@unittest.skipIf(Device.DEFAULT != "TORCH", "Takes too long to compile for Compiled backends")
def test_conv2d_winograd(self):
BS, C1, H, W = 2, 8, 16, 16
C2, K, S, P = 8, 3, 1, 1
old_wino = Tensor.wino
Tensor.wino = True
# create in tinygrad
layer = Conv2d(C1, C2, kernel_size=K, stride=S, padding=P)
layer.weight.requires_grad = True
layer.bias.requires_grad = True
# create in torch
torch_layer = torch.nn.Conv2d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight = torch.nn.Parameter(torch.tensor(layer.weight.numpy(), dtype=torch.float32))
torch_layer.bias = torch.nn.Parameter(torch.tensor(layer.bias.numpy(), dtype=torch.float32))
# test
x = Tensor.uniform(BS, C1, H, W, requires_grad=True)
z = layer(x)
torch_x = torch.tensor(x.numpy(), requires_grad=True)
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
m = z.mean()
m.backward()
gw = layer.weight.grad.realize()
gb = layer.bias.grad.realize()
gx = x.grad.realize()
torch_z.mean().backward()
np.testing.assert_allclose(gw.numpy(), torch_layer.weight.grad.numpy(), atol=5e-4, rtol=1e-5)
np.testing.assert_allclose(gb.numpy(), torch_layer.bias.grad.numpy(), atol=5e-4, rtol=1e-5)
np.testing.assert_allclose(gx.numpy(), torch_x.grad.numpy(), atol=5e-4, rtol=1e-5)
Tensor.wino = old_wino
@unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
def test_conv_transpose1d(self):
BS, C1, W = 4, 16, 224//4
C2, K, S, P = 64, 7, 2, 1
# create in tinygrad
layer = ConvTranspose1d(C1, C2, kernel_size=K, stride=S, padding=P)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.ConvTranspose1d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.uniform(BS, C1, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
@unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
def test_conv_transpose2d(self):
BS, C1, H, W = 4, 16, 224//4, 224//4
C2, K, S, P = 64, 7, 2, 1
# create in tinygrad
layer = ConvTranspose2d(C1, C2, kernel_size=K, stride=S, padding=P)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.ConvTranspose2d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.uniform(BS, C1, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
def test_groupnorm(self):
BS, H, W, C, G = 20, 10, 10, 6, 3
# create in tinygrad
layer = GroupNorm(G, C)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.GroupNorm(G, C).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(BS, C, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_layernorm(self):
N, C, H, W = 20, 5, 10, 10
# create in tinygrad
layer = LayerNorm([H, W])
# create in torch
with torch.no_grad():
torch_layer = torch.nn.LayerNorm([H, W]).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(N, C, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_layernorm_2d(self):
N, C, H, W = 20, 5, 10, 10
# create in tinygrad
layer = LayerNorm2d(C)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.LayerNorm([C]).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(N, C, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x.permute(0,2,3,1)).permute(0,3,1,2)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_instancenorm_2d(self):
N, C, H, W = 20, 5, 10, 10
# create in tinygrad
layer = InstanceNorm(C)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.InstanceNorm2d(C, affine=True).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(N, C, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_instancenorm_3d(self):
N, C, D, H, W = 20, 5, 3, 10, 10
# create in tinygrad
layer = InstanceNorm(C)
# create in torch
with torch.no_grad():
torch_layer = torch.nn.InstanceNorm3d(C, affine=True).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)
# test
x = Tensor.randn(N, C, D, H, W)
z = layer(x)
torch_x = torch.tensor(x.numpy())
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)
def test_embedding(self):
B, T, C, VS = 4, 10, 20, 28
# create in tinygrad
layer = Embedding(VS, C)
with torch.no_grad():
torch_layer = torch.nn.Embedding(VS, C).eval()
torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
# test
x = Tensor(np.random.randint(0, VS, (B, T)).astype(np.float32))
z = layer(x)
torch_x = torch.tensor(x.numpy().astype(np.int32))
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
# test with jit enabled
@TinyJit
def layer_jit(x):
return layer(x).realize()
for _ in range(3):
x = Tensor(np.random.randint(0, VS, (B, T)).astype(np.float32))
z = layer_jit(x)
torch_x = torch.tensor(x.numpy().astype(np.int32))
torch_z = torch_layer(torch_x)
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
if __name__ == '__main__':
unittest.main()

File diff suppressed because it is too large

View File

@@ -0,0 +1,98 @@
import numpy as np
import torch
import unittest
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import Adam, SGD, AdamW
import pytest
pytestmark = pytest.mark.exclude_cuda
np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
W_init = np.random.randn(4,4).astype(np.float32)
m_init = np.random.randn(1,4).astype(np.float32)
class TinyNet:
def __init__(self, tensor):
self.x = tensor(x_init.copy(), requires_grad=True)
self.W = tensor(W_init.copy(), requires_grad=True)
self.m = tensor(m_init.copy())
def forward(self):
out = self.x.matmul(self.W).relu()
# print(out.detach().numpy())
out = out.log_softmax(1)
out = out.mul(self.m).add(self.m).sum()
return out
def step(tensor, optim, steps=1, kwargs={}):
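# builds a fresh TinyNet from the given tensor constructor, runs `steps` optimizer steps, and returns the final x and W for comparison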
net = TinyNet(tensor)
optim = optim([net.x, net.W], **kwargs)
for _ in range(steps):
out = net.forward()
optim.zero_grad()
out.backward()
optim.step()
return net.x.detach().numpy(), net.W.detach().numpy()
class TestOptim(unittest.TestCase):
def _test_optim(self, tinygrad_optim, torch_optim, steps, opts, atol, rtol):
for x,y in zip(step(Tensor, tinygrad_optim, steps, kwargs=opts),
step(torch.tensor, torch_optim, steps, kwargs=opts)):
np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)
def _test_sgd(self, steps, opts, atol, rtol): self._test_optim(SGD, torch.optim.SGD, steps, opts, atol, rtol)
def _test_adam(self, steps, opts, atol, rtol): self._test_optim(Adam, torch.optim.Adam, steps, opts, atol, rtol)
def _test_adamw(self, steps, opts, atol, rtol): self._test_optim(AdamW, torch.optim.AdamW, steps, opts, atol, rtol)
def test_sgd(self): self._test_sgd(1, {'lr': 0.001}, 1e-6, 0)
def test_sgd_high_lr(self): self._test_sgd(1, {'lr': 10}, 1e-6, 1e-5)
def test_sgd_wd(self): self._test_sgd(1, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
def test_sgd_high_lr_wd(self): self._test_sgd(1, {'lr': 10, 'weight_decay': 0.1}, 1e-6, 1e-5)
def test_multistep_sgd(self): self._test_sgd(10, {'lr': 0.001}, 1e-6, 0)
def test_multistep_sgd_high_lr(self): self._test_sgd(10, {'lr': 10}, 1e-6, 3e-4)
def test_multistep_sgd_wd(self): self._test_sgd(10, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
def test_multistep_sgd_high_lr_wd(self): self._test_sgd(10, {'lr': 9, 'weight_decay': 0.1}, 1e-6, 3e-4)
def test_multistep_sgd_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9}, 1e-6, 0)
def test_multistep_sgd_high_lr_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9}, 1e-5, 3e-4)
def test_multistep_sgd_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-6, 0)
def test_multistep_sgd_high_lr_momentum_wd(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-5, 3e-4)
def test_multistep_sgd_nesterov_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True}, 1e-5, 0)
def test_multistep_sgd_high_lr_nesterov_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'nesterov': True}, 1e-5, 3e-4)
def test_multistep_sgd_nesterov_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 0)
def test_multistep_sgd_high_lr_nesterov_momentum_wd(self): self._test_sgd(10, {'lr': 9, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 3e-4)
def test_adam(self): self._test_adam(1, {'lr': 0.001}, 1e-5, 0)
def test_adam_high_lr(self): self._test_adam(1, {'lr': 10}, 1e-4, 1e-4)
def test_adamw(self): self._test_adamw(1, {'lr': 0.001}, 1e-5, 0)
def test_adamw_high_lr(self): self._test_adamw(1, {'lr': 10}, 1e-4, 1e-4)
def test_multistep_adam(self): self._test_adam(10, {'lr': 0.001}, 1e-5, 0)
def test_multistep_adam_high_lr(self): self._test_adam(10, {'lr': 10}, 2e-4, 5e-4)
def test_multistep_adamw(self): self._test_adamw(10, {'lr': 0.001}, 1e-5, 0)
def test_multistep_adamw_high_lr(self): self._test_adamw(10, {'lr': 10}, 5e-4, 2e-3)
def test_duped_weights(self):
for Opt in [Adam, AdamW, SGD]:
losses = []
for i in range(2):
w = Tensor(x_init.copy())
opt = Opt([w], lr=0.1) if i == 0 else Opt([w, w], lr=0.1)
loss = None
for _ in range(3):
loss = w.sum()
opt.zero_grad()
loss.backward()
opt.step()
losses.append(loss.numpy())
np.testing.assert_allclose(losses[0], losses[1], atol=1e-4, rtol=0)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,115 @@
import math
import unittest
import numpy as np
import torch
from tinygrad.tensor import Tensor
import tinygrad.nn as nn
import pytest
from tinygrad.helpers import dtypes
from functools import partial
pytestmark = pytest.mark.webgpu
# https://gist.github.com/devries/11405101
def ksprob(a):
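# Kolmogorov-Smirnov probability: approximates Q_KS(a) = 2 * sum_{j>=1} (-1)^(j-1) * exp(-2 j^2 a^2), truncating once the terms become negligible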
fac, total, termbf = 2.0, 0.0, 0.0
a2 = -2.0 * a * a
for j in range(1, 101):
term = fac * math.exp(a2 * j * j)
total += term
if math.fabs(term) <= 0.001 * termbf or math.fabs(term) <= 1e-8 * total:
return total
fac = -fac
termbf = math.fabs(term)
return 1.0
def kstest(l1, l2):
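# two-sample KS test: finds the max distance between the empirical CDFs of l1 and l2 and returns the p-value for the null hypothesis that both samples come from the same distribution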
n1, n2 = len(l1), len(l2)
l1.sort()
l2.sort()
j1, j2, d, fn1, fn2 = 0, 0, 0.0, 0.0, 0.0
while j1 < n1 and j2 < n2:
d1, d2 = l1[j1], l2[j2]
if d1 <= d2:
fn1 = (float(j1) + 1.0) / float(n1)
j1 += 1
if d2 <= d1:
fn2 = (float(j2) + 1.0) / float(n2)
j2 += 1
dtemp = math.fabs(fn2 - fn1)
if dtemp > d:
d = dtemp
ne = float(n1 * n2) / float(n1 + n2)
nesq = math.sqrt(ne)
prob = ksprob((nesq + 0.12 + 0.11 / nesq) * d)
return prob
def equal_distribution(tiny_func, torch_func=None, numpy_func=None, shape=(20, 23), alpha=0.05):
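# samples from the tinygrad initializer and compares against torch and/or numpy references; returns True when each KS p-value is >= alpha, i.e. we cannot reject that the distributions match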
Tensor.manual_seed(1337)
torch.manual_seed(1337)
np.random.seed(1337)
assert not (torch_func is None and numpy_func is None), "no function to compare with"
x = tiny_func(*shape).numpy().flatten()
if numpy_func is not None: y = numpy_func(shape).flatten()
if torch_func is not None: z = torch_func(shape).numpy().flatten()
return (numpy_func is None or kstest(x, y) >= alpha) and (torch_func is None or kstest(x, z) >= alpha)
def normal_test(func, shape=(20, 23), alpha=0.05): return equal_distribution(func, numpy_func=lambda x: np.random.randn(*x), shape=shape, alpha=alpha)
class TestRandomness(unittest.TestCase):
def test_rand(self):
self.assertFalse(normal_test(Tensor.rand))
self.assertTrue(equal_distribution(Tensor.rand, torch.rand, lambda x: np.random.rand(*x)))
def test_randn(self):
self.assertTrue(normal_test(Tensor.randn))
self.assertTrue(equal_distribution(Tensor.randn, torch.randn, lambda x: np.random.randn(*x)))
def test_normal(self):
self.assertTrue(normal_test(Tensor.normal))
self.assertTrue(equal_distribution(Tensor.normal, lambda x: torch.nn.init.normal_(torch.empty(x), mean=0, std=1), lambda x: np.random.normal(loc=0, scale=1, size=x)))
def test_uniform(self):
self.assertFalse(normal_test(Tensor.uniform))
self.assertTrue(equal_distribution(Tensor.uniform, lambda x: torch.nn.init.uniform_(torch.empty(x)), lambda x: np.random.uniform(size=x)))
self.assertTrue(equal_distribution(partial(Tensor.uniform, low=-100, high=100, dtype=dtypes.int32), numpy_func=lambda x: np.random.randint(low=-100, high=100, size=x)))
def test_scaled_uniform(self):
self.assertFalse(normal_test(Tensor.scaled_uniform))
self.assertTrue(equal_distribution(Tensor.scaled_uniform, lambda x: torch.nn.init.uniform_(torch.empty(x), a=-1, b=1) / math.sqrt(math.prod(x)), lambda x: np.random.uniform(-1, 1, size=x) / math.sqrt(math.prod(x))))
def test_glorot_uniform(self):
self.assertFalse(normal_test(Tensor.glorot_uniform))
self.assertTrue(equal_distribution(Tensor.glorot_uniform, lambda x: torch.nn.init.xavier_uniform_(torch.empty(x)), lambda x: np.random.uniform(-1, 1, size=x) * math.sqrt(6 / (x[0] + math.prod(x[1:])))))
def test_kaiming_uniform(self):
Tensor.manual_seed(1337)
torch.manual_seed(1337)
np.random.seed(1337)
for shape in [(128, 64, 3, 3), (20, 24)]:
self.assertTrue(equal_distribution(Tensor.kaiming_uniform, lambda x: torch.nn.init.kaiming_uniform_(torch.empty(x)), shape=shape))
def test_kaiming_normal(self):
Tensor.manual_seed(1337)
torch.manual_seed(1337)
np.random.seed(1337)
for shape in [(128, 64, 3, 3), (20, 24)]:
self.assertTrue(equal_distribution(Tensor.kaiming_normal, lambda x: torch.nn.init.kaiming_normal_(torch.empty(x)), shape=shape))
def test_conv2d_init(self):
params = (128, 256, (3,3))
assert equal_distribution(lambda *_: nn.Conv2d(*params).weight, lambda _: torch.nn.Conv2d(*params).weight.detach())
assert equal_distribution(lambda *_: nn.Conv2d(*params).bias, lambda _: torch.nn.Conv2d(*params).bias.detach())
def test_linear_init(self):
params = (64, 64)
assert equal_distribution(lambda *_: nn.Linear(*params).weight, lambda _: torch.nn.Linear(*params).weight.detach())
assert equal_distribution(lambda *_: nn.Linear(*params).bias, lambda _: torch.nn.Linear(*params).bias.detach())
def test_bn_init(self):
params = (64,)
assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).weight, lambda _: torch.nn.BatchNorm2d(*params).weight.detach())
assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).bias, lambda _: torch.nn.BatchNorm2d(*params).bias.detach())
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,335 @@
# this will be the new test_ops for the next level
# schedule confirms the right things are capable of fusing
# NOTE: this has overlap with external_test_opt.py
import unittest
from typing import List, Optional
from tinygrad.tensor import Tensor
from tinygrad.ops import LoadOps, Device, Compiled
from tinygrad.helpers import DEBUG, dtypes
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.graph import log_schedule_item, print_tree
from tinygrad import nn
def check_schedule(t:Tensor, allowed:int, to_prerealize:Optional[List[Tensor]]=None, filter_loadops=True):
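# realizes the schedule for t (optionally pre-realizing some tensors first), asserts exactly `allowed` non-LoadOps kernels remain, then checks each one linearizes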
seen = set()
if to_prerealize:
for pre in to_prerealize:
for s in pre.lazydata.schedule(seen.copy()):
log_schedule_item(s)
seen.add(s.out)
sched = t.lazydata.schedule(seen)
for s in sched: log_schedule_item(s)
if filter_loadops: sched = [s for s in sched if s.ast.op not in LoadOps]
if len(sched) != allowed: print(f"SCHEDULE ISSUE, expecting {allowed} got {len(sched)}")
if len(sched) != allowed or DEBUG >= 3:
for i, s in enumerate(sched):
print("op", i)
print_tree(s.ast)
assert len(sched) == allowed
# test that the (non-LoadOps) ops linearize
for s in sched:
if s.ast.op in LoadOps: continue
l = Linearizer(s.ast)
l.hand_coded_optimizations()
l.linearize()
class TestSchedule(unittest.TestCase):
def test_basic_binop_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(10)
d = a+b+c
check_schedule(d, 1)
def test_basic_binop_fusion_deep(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(10)
d = Tensor.empty(10)
e = a+b+c+d
check_schedule(e, 1)
def test_mulacc_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = (a*b).sum()
check_schedule(c, 1)
def test_mulacc_relu_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = (a*b).sum().relu()
check_schedule(c, 1)
def test_binop_reshape_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(5,2)
d = (a+b).reshape(5,2)+c
check_schedule(d, 1)
def test_binop_permute_fusion(self):
a = Tensor.empty(2,5)
b = Tensor.empty(2,5)
c = Tensor.empty(5,2)
d = (a+b).permute(1,0)+c
check_schedule(d, 1)
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled) or Device.DEFAULT == "LLVM", "only test for compiled backends")
def test_constants_are_embedded(self):
a = Tensor.empty(3,3) * 2
check_schedule(a, 2, filter_loadops=False)
def test_binop_elu_fusion(self):
a = Tensor.empty(10)
b = a.elu()
check_schedule(b, 1)
def test_binop_reshape_reduce_fusion(self):
a = Tensor.empty(100)
b = Tensor.empty(100)
c = (a+b).reshape(10, 10).sum(axis=0, keepdim=True)
check_schedule(c, 1)
def test_reduce_reshape_binop_fusion(self):
a = Tensor.empty(10,10)
b = Tensor.empty(10)
c = a.sum(axis=0) + b
check_schedule(c, 1)
@unittest.skip("not pushing permutes through reduces")
def test_reduce_permute_binop_fusion(self):
a = Tensor.empty(10,10,10)
b = Tensor.empty(10,10,1)
c = a.sum(axis=0, keepdim=True).permute(2,1,0) + b
check_schedule(c, 1)
def test_binop_early_reshape_reduce_fusion(self):
a = Tensor.empty(100)
b = Tensor.empty(100)
c = Tensor.empty(10,10)
d = ((a+b).reshape(10,10) + c).sum(axis=0)
check_schedule(d, 1)
def test_diamond_folded(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(10)
d = Tensor.empty(10)
ab = a+b
e = (ab+c) + (ab+d)
check_schedule(e, 1)
def test_cache_binaryop(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = a+b
d = a+b
check_schedule(d, 0, [c])
@unittest.skip("failing in old lazy")
def test_cache_binaryop_reshaped(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = a+b
d = a.reshape(10,1)+b.reshape(10,1)
check_schedule(d, 0, [c])
def test_cache_binaryop_transpose(self):
a = Tensor.empty(10,10)
b = Tensor.empty(10,10)
c = (a.T*b.T).T #.contiguous()
d = a*b
check_schedule(d, 0, [c])
def test_cache_two_reduceops(self):
a = Tensor.empty(10)
b = a.sum()
c = a.sum()
bc = b+c
check_schedule(bc, 1)
def test_fold_double_unary(self):
y = Tensor.empty(2)
out = y.sum(keepdim=True).sqrt().__neg__()
check_schedule(out, 1)
#@unittest.skip("may want to reconsider this")
def test_fold_batchnorm(self):
with Tensor.train():
img = Tensor.empty(1,32,4,4)
bn = nn.BatchNorm2d(32, track_running_stats=False)
out = bn(img)
check_schedule(out, 3)
def test_fold_conv_relu(self):
c1 = nn.Conv2d(3,16,3)
# run
img = Tensor.ones(2,3,64,64)
out = c1(img).relu()
check_schedule(out, 1, [c1.weight, c1.bias])
def test_fold_conv_elu(self):
c1 = nn.Conv2d(3,16,3)
# run
img = Tensor.rand(2,3,64,64)
out = c1(img).elu()
check_schedule(out, 1, [c1.weight, c1.bias])
def test_two_sum(self):
img = Tensor.empty(64,64)
x = (img.sum(0) + img.sum(1))
out = x.relu()
del x # without this, the schedule is 3 kernels
check_schedule(out, 2)
@unittest.skip("failing in old lazy")
def test_push_permute_through_reshape(self):
a = Tensor.empty(16,16)
b = Tensor.empty(16,16)
c = (a+b).reshape(4,4,4,4).permute(2,3,0,1).contiguous()
check_schedule(c, 1)
@unittest.skip("failing in old lazy")
def test_push_permute_through_reshape_alt(self):
a = Tensor.empty(4,4,4,4)
b = Tensor.empty(4,4,4,4)
c = (a+b).reshape(16,16).permute(1,0).contiguous()
check_schedule(c, 1)
def test_no_binop_rerun(self):
a = Tensor.empty(16)
b = Tensor.empty(16)
c = a+b
d = (a+b).reshape(16,1)
check_schedule(d, 0, [c])
def test_multi_permute_should_collapse(self):
a = Tensor.empty(4,4,4,4)
b = Tensor.empty(16)
c = a.sum((0,1)).cast(dtypes.float16).permute(1,0).reshape(4,4,1).permute(1,0,2).reshape(16) + b
check_schedule(c, 1)
@unittest.skip("failing in old lazy")
def test_fancy_reshape_fusion(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = a+b
d = a.reshape(10,1)+b.reshape(10,1)
out = c.sum() + d.sum()
check_schedule(out, 1)
# NOTE: for this to pass, LazyViews must be children of LazyBuffers so the (a+b) runs first
@unittest.skip("not real world")
def test_children_dont_push(self):
a = Tensor.empty(10, 10, 1)
b = Tensor.empty(10, 10, 1)
d = (a+b).expand(10, 10, 10)
e = (a+b).permute(2,1,0)
f = d+e
check_schedule(f, 2)
def test_dont_fuse_binops_with_children(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
c = Tensor.empty(10)
keep_me = a+b
e = keep_me.sum() # give keep_me a child (NOTE: BinaryOps won't be a child since it will instant fuse)
d = keep_me+c
check_schedule(d, 2)
check_schedule(keep_me, 0, [d])
@unittest.skip("failing in old lazy")
def test_permute_breaks_fusion(self):
a = Tensor.empty(10, 10, 10)
b = Tensor.empty(10, 10)
c = (a.sum(axis=2) + b).permute(1,0)
d = c.permute(1,0)
check_schedule(d, 1)
def test_some_permute_fusion(self):
a = Tensor.empty(8192, 16)
b = Tensor.empty(1, 16)
d = (a.T + b.expand(8192, 16).T)
c = a + b.expand(8192, 16)
e = d.T
check_schedule(c, 1)
check_schedule(e, 1)
# this is the failing case in openpilot... it's very simple, like this
@unittest.skip("failing in old lazy")
def test_image_conv_fusion(self):
from tinygrad.features.image import image_conv2d
w1 = Tensor.empty(16, 16, 1, 1)
b1 = Tensor.empty(16)
w2 = Tensor.empty(16, 16, 1, 1)
b2 = Tensor.empty(16)
w3 = Tensor.empty(16, 16, 1, 1)
b3 = Tensor.empty(16)
x = Tensor.empty(1, 16, 32, 32)
x = base = image_conv2d(x, w1, b1)
x = image_conv2d(x, w2, b2) + base
x = image_conv2d(x, w3, b3)
# NOOP, 3 convs, contiguous
check_schedule(x, 5)
def test_image_conv_fusion_minimal(self):
b1 = Tensor.empty(16)
b2 = Tensor.empty(16)
def p(x): return x.permute(1,0).contiguous().reshape(32,16,1).expand(32,16,16).sum(axis=2).permute(1,0)
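# p roughly mimics the data movement of a 1x1 image conv: permute/reshape/expand, a reduce over the channel axis, then a permute back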
x = Tensor.empty(16, 32)
x = base = p(x) + b1.reshape(16,1)
x = p(x)
x = x + b2.reshape(16,1)
x = x + base
del base
x = p(x)
check_schedule(x, 4)
def test_image_conv_fusion_more_minimal(self):
b1 = Tensor.empty(16)
def p(x): return x.permute(1,0).contiguous().reshape(32,16,1).expand(32,16,16).sum(axis=2).permute(1,0)
x = Tensor.empty(16, 32)
x = base = p(x) + b1.reshape(16,1)
x = p(x)
del base
check_schedule(x, 3)
def test_resnet_block(self):
from models.resnet import BasicBlock
Tensor.training = False
bb = BasicBlock(64,64)
x = Tensor.empty(1, 64, 32, 32)
out = bb(x)
check_schedule(out, 4)
def test_contiguous_while_contiguous(self):
x = Tensor.empty(1, 64, 32, 32)
out = x.contiguous()
check_schedule(out, 1, filter_loadops=False)
def test_contiguous_while_not_contiguous(self):
x = Tensor.empty(1, 64, 32, 32)
out = x.permute(0,2,3,1).contiguous()
check_schedule(out, 2, filter_loadops=False)
def test_double_from(self):
x = Tensor([1,2,3,4])
out = x.to('cpu')
check_schedule(out, 0, filter_loadops=False)
def test_pow_const_tensor(self):
x = Tensor([1,2,3,4])
out = x ** Tensor(2)
check_schedule(out, 1)
if __name__ == '__main__':
unittest.main(verbosity=2)

View File

@@ -0,0 +1,19 @@
import unittest
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.features.search import time_linearizer
from tinygrad.ops import Compiled, Device, LoadOps
from tinygrad.tensor import Tensor
class TestTimeLinearizer(unittest.TestCase):
def setUp(self) -> None:
if not isinstance(Device[Device.DEFAULT], Compiled): raise unittest.SkipTest("only test for compiled backends")
def test_reasonable_time(self):
si = [si for si in Tensor([1,2,3,4]).add(1).lazydata.schedule() if si.ast.op not in LoadOps][0]
rawbufs = [Device[Device.DEFAULT].buffer(si.out.st.size(), si.out.dtype)] + [Device[Device.DEFAULT].buffer(x.st.size(), x.dtype) for x in si.inputs]
tm = time_linearizer(Linearizer(si.ast), rawbufs, allow_test_size=False, cnt=10)
assert tm > 0 and tm != float('inf')
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,57 @@
import unittest
from tinygrad.tensor import Tensor
from tinygrad.helpers import dtypes
from tinygrad.ops import Device
import pytest
# similar to test/external/external_test_gpu_ast.py, but universal
pytestmark = pytest.mark.exclude_cuda
class TestSpecific(unittest.TestCase):
# from openpilot
# 1x1 6 <- 24
def test_1x1_6_24(self):
x = Tensor.randn(1, 24*4, 32, 64)
w = Tensor.randn(6*4, 24*4, 1, 1)
x.conv2d(w).permute(0,2,3,1).reshape(32, 384, 4).contiguous().realize()
def test_vec_mul(self):
# this forces it to be an image...
x = Tensor.ones(1, 512, 4).contiguous().reshape(1, 2048)
w = Tensor.randn(2048, 512)
(x @ w).reshape(1, 128, 4).contiguous().realize()
@unittest.skipIf(Device.DEFAULT in ["LLVM", "WEBGPU"], "Broken on LLVM and webgpu")
def test_big_vec_mul(self):
# from LLaMA
# 0 buffer<4096, dtypes.float> [View((1024, 1, 1, 4), (4, 0, 0, 1), 0, None)]
# 1 buffer<4096, dtypes.float> [View((1024, 1024, 4, 4), (0, 4, 1, 0), 0, None)]
# 2 buffer<16777216, dtypes.half> [View((1024, 1024, 4, 4), (16384, 4, 1, 4096), 0, None)]
x = Tensor.randn(4096).realize()
w = Tensor.randn(4096, 4096, device='cpu').cast(dtypes.float16).to(Device.DEFAULT).realize()
(x @ w.T).realize()
# from https://dl.acm.org/doi/pdf/10.1145/3495243.3517020
# ~260 GFLOPS on Adreno 640, should be 260*(720/890)*(596/710) = 176.5 on downclocked 630
# we get 170
def test_1x1_28_28(self):
x = Tensor.randn(1, 256, 28, 28)
w = Tensor.randn(256, 256, 1, 1)
x.conv2d(w).permute(0,2,3,1).reshape(28, 28*256//4, 4).contiguous().realize()
# 132 GFLOPS on Adreno 640, should be 132*(720/890)*(596/710) = 90 on downclocked 630
# gets 54 with broken opt, 74 without opt, and 146 if we pad and opt 3!
def test_3x3_28_28_stride_2(self):
x = Tensor.randn(1, 288, 36, 36)
w = Tensor.randn(384, 288, 3, 3)
x.conv2d(w, stride=2).permute(0,2,3,1).reshape(17, 17*384//4, 4).contiguous().realize()
def test_3x3_28_28_stride_2_padded(self):
x = Tensor.randn(1, 288, 36, 36)
w = Tensor.randn(384, 288, 3, 3)
x.conv2d(w, stride=2, padding=1).permute(0,2,3,1).reshape(18, 18*384//4, 4).contiguous().realize()
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,288 @@
import os
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
import unittest
import torch
torch.set_num_threads(1)
import time
import numpy as np
np.set_printoptions(linewidth=160)
from tinygrad.ops import Device
from tinygrad.helpers import GlobalCounters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
from tinygrad.helpers import colored, getenv, CI
from tinygrad.jit import TinyJit
import pytest
pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
torch_dt = torch.float16 if getenv("HALF", 0) else torch.float32
torch_device = torch.device('mps' if getenv("MPS", 0) else ('cuda' if getenv("TORCHCUDA", 0) else 'cpu'))
if str(torch_device) == "mps":
import torch.mps
sync = lambda: torch.mps.synchronize()
elif str(torch_device) == "cuda":
import torch.cuda
sync = lambda: torch.cuda.synchronize()
else:
sync = lambda: None
def colorize_float(x):
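# colors the et_tinygrad/et_torch ratio: green when tinygrad is clearly faster (<0.75x), red when clearly slower (>1.15x), yellow otherwise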
ret = f"{x:7.2f}x"
if x < 0.75:
return colored(ret, 'green')
elif x > 1.15:
return colored(ret, 'red')
else:
return colored(ret, 'yellow')
save_ops, save_mem = 0, 0
CNT = getenv("CNT", 8)
def helper_test_speed(f1, *args):
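# times f1 over CNT runs, mutating the args and touching a scratch buffer between runs to defeat op and memory caches; returns the result and the best (minimum) time in ms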
global save_ops, save_mem
ets = []
ret = None
cache_defeat = np.zeros((2048,2048))
for i in range(CNT):
del ret
# defeat the operation cache
args = [(x+1).realize() if isinstance(x, Tensor) else (None if x is None else (x+1)) for x in args]
# force syncing
[x.numpy() if isinstance(x, Tensor) or str(torch_device) == "cpu" else x.cpu().numpy() for x in args if x is not None]
# clear 32MB global memory cache (CPU and global memory only)
cache_defeat += 1
# manual pre sync
if isinstance(args[0], Tensor): Device[args[0].device].synchronize()
else: sync()
GlobalCounters.global_ops = 0
GlobalCounters.global_mem = 0
st = time.perf_counter()
ret = f1(*args)
if isinstance(ret, Tensor): Device[ret.device].synchronize()
else: sync()
et = (time.perf_counter() - st) * 1000
if i >= 1: ets.append(et)
if GlobalCounters.global_ops:
save_ops, save_mem = GlobalCounters.global_ops, GlobalCounters.global_mem
return ret.numpy() if isinstance(ret, Tensor) else ret.cpu().numpy(), np.min(ets)
def helper_test_generic_square(name, N, f1, f2, onearg=False):
torch.manual_seed(0)
torch_a = (torch.rand(N, N, dtype=torch_dt) - 0.5).to(torch_device)
torch_b = (torch.rand(N, N, dtype=torch_dt) - 0.5).to(torch_device) if not onearg else None
tiny_a = Tensor(torch_a.cpu().numpy())
tiny_b = Tensor(torch_b.cpu().numpy()) if not onearg else None
helper_test_generic(f"{name:30s} {N:5d}x{N:5d}", f1, (torch_a, torch_b), TinyJit(lambda a,b:f2(a,b).realize()), (tiny_a, tiny_b))
def helper_test_matvec(name, N, M):
torch.manual_seed(0)
torch_a = (torch.rand(N, dtype=torch_dt) - 0.5).to(torch_device)
torch_b = (torch.rand(N, M, dtype=torch_dt) - 0.5).to(torch_device)
tiny_a = Tensor(torch_a.cpu().numpy())
tiny_b = Tensor(torch_b.cpu().numpy())
helper_test_generic(f"{name:30s} {N:5d}x{M:5d}", lambda a,b: a@b, (torch_a, torch_b), TinyJit(lambda a,b:(a@b).realize()), (tiny_a, tiny_b))
prefix = None
def helper_test_generic(name, f1, f1_args, f2, f2_args):
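# runs the torch fn and the tinygrad fn through helper_test_speed, prints a side-by-side GFLOPS/GB/s comparison, and asserts the outputs match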
global prefix
with torch.no_grad():
val_torch, et_torch = helper_test_speed(f1, *f1_args)
val_tinygrad, et_tinygrad = helper_test_speed(f2, *f2_args)
desc = "faster" if et_torch > et_tinygrad else "slower"
flops = save_ops*1e-6
mem = save_mem*1e-6
print(("\r" if not CI else "")+f"{name:42s} {et_torch:7.2f} ms ({flops/et_torch:8.2f} GFLOPS {mem/et_torch:8.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:8.2f} GFLOPS {mem/et_tinygrad:8.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")
np.testing.assert_allclose(val_tinygrad, val_torch, atol=1e-3, rtol=1e-3)
def helper_test_conv(bs, in_chans, out_chans, kernel_size, img_size_y, img_size_x):
torch.manual_seed(0)
torch_dat = torch.rand(bs, in_chans, img_size_y, img_size_x, dtype=torch_dt).to(torch_device)
torch_conv = torch.nn.Conv2d(in_chans, out_chans, kernel_size, bias=None, dtype=torch_dt).to(torch_device)
tiny_dat = Tensor(torch_dat.cpu().numpy())
tiny_conv = Conv2d(in_chans, out_chans, kernel_size, bias=None)
tiny_conv.weight = Tensor(torch_conv.weight.detach().cpu().numpy())
def f1(torch_dat): return torch_conv(torch_dat)
def f2(tiny_dat): return tiny_conv(tiny_dat).realize()
helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:{kernel_size}", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))
@unittest.skipIf(getenv("BIG") == 0, "no big tests")
class TestBigSpeed(unittest.TestCase):
def test_add(self):
def f(a, b): return a+b
helper_test_generic_square('add', 8192, f, f)
def test_exp(self):
def f(a, b): return a.exp()
helper_test_generic_square('exp', 8192, f, f, onearg=True)
def test_gemm_2048(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 2048, f, f)
def test_gemm_4096(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 4096, f, f)
def test_large_conv_1x1(self): helper_test_conv(bs=32, in_chans=128, out_chans=128, kernel_size=1, img_size_y=128, img_size_x=128)
def test_large_conv_3x3(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=3, img_size_y=130, img_size_x=130)
def test_large_conv_5x5(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=5, img_size_y=132, img_size_x=132)
def test_matvec_4096_16384(self): helper_test_matvec('matvec_4096_16384', 4096, 16384)
def test_matvec_16384_4096(self): helper_test_matvec('matvec_16384_4096', 16384, 4096)
@unittest.skipIf(getenv("BIG") == 1, "only big tests")
class TestSpeed(unittest.TestCase):
def test_sub(self):
def f(a, b): return a-b
helper_test_generic_square('sub', 4096, f, f)
@unittest.skipIf(CI and Device.DEFAULT == "WEBGPU", "breaking on webgpu CI")
def test_pow(self):
def f(a, b): return a.pow(b)
helper_test_generic_square('pow', 2048, f, f)
def test_sum(self):
def f(a, b): return a.sum()
helper_test_generic_square('sum', 2048, f, f, onearg=True)
helper_test_generic_square('sum', 4096, f, f, onearg=True)
def test_partial_sum(self):
R = 256
def f(a, b): return a.reshape(int(4096//R), int(4096*R)).sum(axis=1)
helper_test_generic_square('partial_sum', 4096, f, f, onearg=True)
@unittest.skip("not really used in models")
def test_cumsum(self):
def f0(a, b): return a.cumsum(axis=0)
def f1(a, b): return a.cumsum(axis=1)
helper_test_generic_square('cumsum_0', 256, f0, f0, onearg=True)
helper_test_generic_square('cumsum_1', 256, f1, f1, onearg=True)
def test_cat(self):
helper_test_generic_square('cat_0', 256, lambda x,y: torch.cat((x,y),dim=0), lambda x,y: x.cat(y,dim=0))
helper_test_generic_square('cat_1', 256, lambda x,y: torch.cat((x,y),dim=1), lambda x,y: x.cat(y,dim=1))
def test_array_packing(self):
N = 2048
def f(a, b): return a.reshape(N, N // 32, 32).permute(1,0,2).contiguous()
helper_test_generic_square('array_packing', N, f, f, onearg=True)
def test_permute(self):
for N in [1024, 4096]:
# this is a 64MB tensor, M1 L1 cache is 128kB
# to fit easily in L1, rotations should be 128x128 chunks. 128x128 is also the AMX size
def f(a, b): return a.permute(1,0).contiguous()
helper_test_generic_square('permute', N, f, f, onearg=True)
def test_double_permute(self):
N = 64
torch.manual_seed(0)
torch_a = (torch.rand(N, N, N, N, dtype=torch_dt) - 0.5).to(torch_device)
tiny_a = Tensor(torch_a.cpu().numpy())
def f(a): return a.permute(1,0,3,2).contiguous()
helper_test_generic(f"double_permute {tiny_a.shape}", f, (torch_a,), TinyJit(lambda a: f(a).realize()), (tiny_a,))
def test_neg(self):
def f(a, b): return -a
helper_test_generic_square('neg', 4096, f, f, onearg=True)
def test_exp(self):
def f(a, b): return a.exp()
helper_test_generic_square('exp', 2048, f, f, onearg=True)
def test_relu(self):
def f(a, b): return a.relu()
helper_test_generic_square('relu', 4096, f, f, onearg=True)
def test_max(self):
def f(a, b): return a.max()
helper_test_generic_square('max', 4096, f, f, onearg=True)
def test_mul_sum(self):
def f(a, b): return (a*b).sum()
helper_test_generic_square('mul_sum', 4096, f, f)
def test_add(self):
for N in [1, 1024, 4096]:
def f(a, b): return a + b
helper_test_generic_square('add', N, f, f)
def test_add_constant(self):
def f(a, b): return a+2.0
helper_test_generic_square('add_constant', 4096, f, f, onearg=True)
def test_add_sq(self):
def f(a, b): return a*a + b*b
helper_test_generic_square('add_sq', 4096, f, f)
def test_gemm(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 1024, f, f)
def test_gemm_small(self):
def f(a, b): return a @ b
helper_test_generic_square('gemm', 256, f, f)
def test_gemm_unrolled(self):
N = 512
def f1(a, b): return a@b.T
def f2(a, b): return (a.reshape(N, 1, N).expand(N, N, N) * b.reshape(1, N, N).expand(N, N, N)).sum(axis=2)
helper_test_generic_square('gemm_unrolled', N, f1, f2)
def test_gemm_unrolled_permute_l(self):
N = 512
def f1(a, b): return a.T@b.T
def f2(a, b): return (a.permute(1,0).reshape(N, 1, N).expand(N, N, N) * b.reshape(1, N, N).expand(N, N, N)).sum(axis=2)
helper_test_generic_square('gemm_unrolled_permute_l', N, f1, f2)
def test_gemm_unrolled_permute_r(self):
N = 512
def f1(a, b): return a@b
def f2(a, b): return (a.reshape(N, 1, N).expand(N, N, N) * b.permute(1,0).reshape(1, N, N).expand(N, N, N)).sum(axis=2)
helper_test_generic_square('gemm_unrolled_permute_r', N, f1, f2)
def test_gemm_unrolled_permute_lr(self):
N = 512
def f1(a, b): return a.T@b
def f2(a, b): return (a.permute(1,0).reshape(N, 1, N).expand(N, N, N) * b.permute(1,0).reshape(1, N, N).expand(N, N, N)).sum(axis=2)
helper_test_generic_square('gemm_unrolled_permute_lr', N, f1, f2)
def test_matvec_1024_1024(self): helper_test_matvec('matvec_1024_1024', 1024, 1024)
def test_matvec_1024_4096(self): helper_test_matvec('matvec_1024_4096', 1024, 4096)
def test_matvec_4096_1024(self): helper_test_matvec('matvec_4096_1024', 4096, 1024)
def test_matvec_4096_4096(self): helper_test_matvec('matvec_4096_4096', 4096, 4096)
def test_openpilot_conv2d(self):
bs, in_chans, out_chans = 1,12,32
torch.manual_seed(0)
torch_dat = torch.rand(bs, 64, 128, 12, dtype=torch_dt).to(torch_device)
torch_conv = torch.nn.Conv2d(in_chans, out_chans, 3, bias=None, padding=1, dtype=torch_dt).to(torch_device)
tiny_dat = Tensor(torch_dat.cpu().numpy())
tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
tiny_conv.weight = Tensor(torch_conv.weight.detach().cpu().numpy())
def f1(torch_dat): return torch_conv(torch_dat.permute(0,3,1,2))
def f2(tiny_dat): return tiny_conv(tiny_dat.permute(0,3,1,2)).realize()
helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:3", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))
def test_conv2d(self):
for bs in [32]:
for in_chans in IN_CHANS:
for out_chans in [32]:
helper_test_conv(bs, in_chans, out_chans, 3, 34, 34)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,181 @@
import unittest
from tinygrad.jit import TinyJit
from tinygrad.helpers import getenv
from tinygrad.shape.symbolic import Variable
from tinygrad.tensor import Tensor, Device
import numpy as np
@unittest.skipIf(getenv("ARM64") or getenv("PTX"), "ARM64 and PTX are not supported")
@unittest.skipUnless(Device.DEFAULT in ["GPU", "METAL", "CLANG", "CUDA", "LLVM"], f"{Device.DEFAULT} is not supported")
class TestSymbolicJit(unittest.TestCase):
def test_plus1(self):
def f(a): return (a+1).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
symbolic = jf(a.reshape(3, vi)).reshape(3, i).numpy()
expected = f(a).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
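    # the pattern used throughout this class: Variable("i", 1, 10) declares a
    # symbolic dim with known bounds, .bind(i) attaches this call's concrete
    # value, and reshape swaps the concrete dim for the symbol. A rough sketch
    # of one iteration:
    #   vi = Variable("i", 1, 10).bind(4)
    #   t = Tensor.rand(3, 4).reshape(3, vi)   # shape is now (3, i)
    # every bound value then reuses one cached kernel, which is what the
    # len(jf.jit_cache) == 1 asserts verify.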
def test_reshape_inside_plus1(self):
def f(a, jit=False, jit_ctx=None):
if jit: a = a.reshape(3, Variable("i", 1, 10).bind(a.shape[1]))
return (a+1).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10)
a = Tensor.rand(3, i)
symbolic = jf(a, jit=True, jit_ctx={vi: i}).reshape(3, i).numpy()
expected = f(a).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_add(self):
def f(a, b): return (a+b).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(3, i)
symbolic = jf(a.reshape(3, vi), b.reshape(3, vi)).reshape(3, i).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_matmul(self):
def f(a, b): return (a@b).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(i, 5)
symbolic = jf(a.reshape(3, vi), b.reshape(vi, 5)).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_mixed_with_no_symbol_kernel(self):
def f(a, b):
s = (a@b).realize()
s = (s+s).realize() # this one does not have symbols in input
return s
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(i, 5)
symbolic = jf(a.reshape(3, vi), b.reshape(vi, 5)).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 2
def test_attention(self):
def f(q, k, v): return Tensor.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
q = Tensor.rand(2, 1, 4, 8)
k = Tensor.rand(2, i, 4, 8)
v = Tensor.rand(2, i, 4, 8)
symbolic = jf(q, k.reshape(2, vi, 4, 8), v.reshape(2, vi, 4, 8)).reshape(2, 4, 1, 8).numpy()
expected = f(q, k, v).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 6
def test_cat_dim0(self):
def f(a, b): return a.cat(b, dim=0).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(i, 3)
b = Tensor.rand(2, 3)
symbolic = jf(a.reshape(vi, 3), b).reshape(i+2, 3).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_cat_dim1(self):
def f(a, b): return a.cat(b, dim=1).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(3, 2)
symbolic = jf(a.reshape(3, vi), b).reshape(3, i+2).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_cat_dim0_two_vars(self):
def f(a, b): return a.cat(b, dim=0).realize()
jf = TinyJit(f)
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(i, 3)
b = Tensor.rand(j, 3)
symbolic = jf(a.reshape(vi, 3), b.reshape(vj, 3)).reshape(i+j, 3).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_cat_dim1_two_vars(self):
def f(a, b): return a.cat(b, dim=1).realize()
jf = TinyJit(f)
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(3, i)
b = Tensor.rand(3, j)
symbolic = jf(a.reshape(3, vi), b.reshape(3, vj)).reshape(3, i+j).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_two_vars_plus1(self):
def f(a, b): return (a@b+1).realize()
jf = TinyJit(f)
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(i, 3)
b = Tensor.rand(3, j)
symbolic = jf(a.reshape(vi, 3), b.reshape(3, vj)).reshape(i, j).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
def test_jit_symbolic_shape_mismatch(self):
@TinyJit
def add(a, b): return (a+b).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i).reshape(3, vi)
b = Tensor.rand(3, i).reshape(3, vi)
c = add(a, b)
vi2 = Variable("i", 1, 10).bind(7)
a = Tensor.rand(3, 7).reshape(3, vi2)
bad = Tensor.rand(4, 7).reshape(4, vi2)
with self.assertRaises(AssertionError):
add(a, bad)
def test_shrink(self):
    # shrink is a movement op, so we pair it with a simple elementwise function to check the JIT interaction
def f(a): return (a+1).realize()
jf = TinyJit(f)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(7, 11)
symbolic = a.shrink(((3,5),(vi,vi+2)))
symbolic = jf(symbolic).numpy()
expected = f(a.shrink(((3,5),(i,i+2)))).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert len(jf.jit_cache) == 1
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,124 @@
import unittest
from tinygrad.jit import JIT_SUPPORTED_DEVICE
from tinygrad.shape.symbolic import Variable
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor, Device
import numpy as np
@unittest.skipIf(getenv("ARM64") or getenv("PTX"), "ARM64 and PTX are not supported")
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["HIP", "WEBGPU"], f"{Device.DEFAULT} is not supported")
class TestSymbolicOps(unittest.TestCase):
def test_plus1(self):
def f(a): return (a+1).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
symbolic = f(a.reshape(3, vi)).reshape(3, i).numpy()
expected = f(a).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_add(self):
def f(a, b): return (a+b).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(3, i)
symbolic = f(a.reshape(3, vi), b.reshape(3, vi)).reshape(3, i).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_matmul(self):
def f(a, b): return (a@b).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(i, 5)
symbolic = f(a.reshape(3, vi), b.reshape(vi, 5)).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_attention(self, dropout_p=0.0):
def f(q, k, v): return Tensor.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
q = Tensor.rand(2, 1, 4, 8)
k = Tensor.rand(2, i, 4, 8)
v = Tensor.rand(2, i, 4, 8)
symbolic = f(q, k.reshape(2, vi, 4, 8), v.reshape(2, vi, 4, 8)).reshape(2, 4, 1, 8).numpy()
expected = f(q, k, v).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_attention_training(self):
with Tensor.train():
self.test_attention(dropout_p=0.0)
with self.assertRaises(AssertionError):
# symbolic shape dropout is not supported
self.test_attention(dropout_p=0.5)
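      # the dropout mask is random over a concrete element count, which a
      # symbolic dim can't provide - presumably why dropout_p > 0 asserts here
      # while dropout_p=0.0 (no mask applied) still passes under training.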
def test_cat_dim0(self):
def f(a, b): return a.cat(b, dim=0).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(i, 3)
b = Tensor.rand(2, 3)
symbolic = f(a.reshape(vi, 3), b).reshape(i+2, 3).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_cat_dim1(self):
def f(a, b): return a.cat(b, dim=1).realize()
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(3, i)
b = Tensor.rand(3, 2)
symbolic = f(a.reshape(3, vi), b).reshape(3, i+2).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_cat_dim0_two_vars(self):
def f(a, b): return a.cat(b, dim=0).realize()
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(i, 3)
b = Tensor.rand(j, 3)
symbolic = f(a.reshape(vi, 3), b.reshape(vj, 3)).reshape(i+j, 3).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_cat_dim1_two_vars(self):
def f(a, b): return a.cat(b, dim=1).realize()
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(3, i)
b = Tensor.rand(3, j)
symbolic = f(a.reshape(3, vi), b.reshape(3, vj)).reshape(3, i+j).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_two_vars_plus1(self):
def f(a, b): return (a@b+1).realize()
for i in range(1, 5):
for j in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
a = Tensor.rand(i, 3)
b = Tensor.rand(3, j)
symbolic = f(a.reshape(vi, 3), b.reshape(3, vj)).reshape(i, j).numpy()
expected = f(a, b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_shrink(self):
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(7, 11)
symbolic = a.shrink(((3,5),(vi,vi+2)))
symbolic = symbolic.numpy()
expected = a.shrink(((3,5),(i,i+2))).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,173 @@
import unittest
from tinygrad.shape.shapetracker import ShapeTracker, View
from tinygrad.shape.symbolic import Variable
from tinygrad.tensor import Tensor
class TestSymbolic(unittest.TestCase):
def test_symbolic_st(self):
x = Variable("x", 1, 100)
st = ShapeTracker.from_shape((x, 3))
assert st.shape == (x, 3)
assert st.real_strides() == (3, 1)
def test_expr_idxs(self):
x = Variable("x", 1, 100)
st = ShapeTracker.from_shape((x, 3))
idxs = [Variable("x", 0, 100), Variable("y", 0, 100)]
e1, e2 = st.expr_idxs(idxs)
assert e1.render() == "((x*3)+y)"
assert e2.render() == "1"
st = st.permute((1, 0))
e1, e2 = st.expr_idxs(idxs)
assert e1.render() == "((y*3)+x)"
assert e2.render() == "1"
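    # stride math behind the asserts: a contiguous (x, 3) tracker has strides
    # (3, 1), so indexing with (x, y) renders as x*3 + y; permute((1, 0)) swaps
    # the strides, giving y*3 + x over the same underlying buffer.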
def test_cat_dim0_strides(self):
i = Variable("i", 1, 5).bind(3)
j = Variable("j", 1, 5).bind(3)
k = Variable("k", 1, 5).bind(3)
t = Tensor.rand(3, 4).reshape(i, 4).cat(Tensor.rand(3, 4).reshape(j, 4), dim=0).cat(Tensor.rand(3, 4).reshape(k, 4), dim=0)
st = t.lazydata.st
assert st.shape == (i+j+k, 4)
assert st.real_strides() == (4, 1)
t = Tensor.rand(3, 3).reshape(i, 3).cat(Tensor.rand(3, 3).reshape(i, 3), dim=0).cat(Tensor.rand(3, 3), dim=0)
st = t.lazydata.st
assert st.shape == (2*i+3, 3)
assert st.real_strides() == (3, 1)
def test_cat_dim1_strides(self):
i = Variable("i", 1, 5).bind(4)
j = Variable("j", 1, 5).bind(4)
k = Variable("k", 1, 5).bind(4)
t = Tensor.rand(3, 4).reshape(3, i).cat(Tensor.rand(3, 4).reshape(3, j), dim=1).cat(Tensor.rand(3, 4).reshape(3, k), dim=1)
st = t.lazydata.st
assert st.shape == (3, i+j+k)
assert st.real_strides() == (i+j+k, 1)
class TestSymbolicVarVals(unittest.TestCase):
def test_var_vals_empty(self):
assert ShapeTracker.from_shape((3, 4, 5)).var_vals == {}
def test_var_vals_shape(self):
x = Variable("x", 1, 100).bind(3)
assert ShapeTracker.from_shape((x, 3)).var_vals == {Variable("x", 1, 100): 3}
def test_var_vals_offset(self):
x = Variable("x", 1, 100).bind(3)
st = ShapeTracker.from_shape((4, 3)).shrink(((x, x+1), (0, 3)))
assert st.real_offset() == x * 3
assert st.var_vals == {Variable("x", 1, 100): 3}
def test_var_vals_mask(self):
x = Variable("x", 1, 100).bind(3)
view = View.create(shape=(3,4), strides=(4,1), offset=0, mask=((0, x), (0, 4)))
st = ShapeTracker(views=(view,))
assert st.var_vals == {Variable("x", 1, 100): 3}
def test_var_vals_complex(self):
x = Variable("x", 1, 100).bind(3)
y = Variable("y", 1, 100).bind(4)
z = Variable("z", 1, 100).bind(5)
st = ShapeTracker.from_shape((x, 5, y)).shrink(((0, x), (z, z+1), (0, 3)))
assert st.real_offset() == y * z
assert st.var_vals == {Variable("x", 1, 100): 3, Variable("y", 1, 100):4, Variable("z", 1, 100): 5}
def test_shrink_reshape(self):
x = Variable("x", 1, 100).bind(3)
st = ShapeTracker.from_shape((10, 10, 10)).shrink(((x, x+3), (3, 7), (2, 5)))
st = st.reshape((3*4*3,))
assert st.var_vals == {Variable("x", 1, 100): 3}
class TestShapeTrackerUnbind(unittest.TestCase):
def test_view_unbind(self):
v = Variable("v", 1, 100)
bv = Variable("v", 1, 100).bind(3)
assert View.create(shape=(bv, 4)).unbind() == View.create(shape=(v, 4))
def test_reshape_unbind(self):
v = Variable("v", 1, 100)
bv = Variable("v", 1, 100).bind(3)
t = Tensor.rand(3, 4).reshape(bv, 4)
assert t.lazydata.st.unbind() == ShapeTracker((View.create(shape=(v, 4)),))
def test_shrink_unbind(self):
v = Variable("v", 1, 100)
bv = Variable("v", 1, 100).bind(2)
t = Tensor.rand(3, 4).shrink(((bv, bv+1), (0, 4)))
assert t.lazydata.st.unbind() == ShapeTracker((View.create(shape=(1, 4), offset=4*v),))
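  # unbind() drops the concrete values from bound Variables, so trackers that
  # differ only in the bound value (3 vs 2 above) unbind to equal symbolic
  # trackers - a sketch of how one symbolic kernel can serve many shapes.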
class TestSymbolicReshape(unittest.TestCase):
def test_reshape_into_symbols_simple(self):
for i in range(1, 6):
vi = Variable("i", 1, 5).bind(i)
t = Tensor.rand(i, 4).reshape(vi, 4)
assert t.shape == (vi, 4)
t = Tensor.rand(i, 6).reshape(vi, 2, 3)
assert t.shape == (vi, 2, 3)
def test_reshape_symbols_reshape_ints(self):
for i in range(1, 6):
vi = Variable("i", 1, 5).bind(i)
t = Tensor.rand(i, 4).reshape(vi, 4)
assert t.shape == (vi, 4)
t = t.reshape(i, 4)
assert t.shape == (i, 4)
def test_reshape_into_symbols_bad_shape(self):
vi = Variable("i", 1, 10).bind(4)
with self.assertRaises(AssertionError):
      t = Tensor.rand(4, 6).reshape(vi, 6).reshape(1, 77) # reshape to a new shape with a different total size through a symbolic dim
    with self.assertRaises(AssertionError):
      t = Tensor.rand(3, 4).reshape(3, (vi+1)) # reshape into an expression that is not a plain Variable
def test_two_symbol_reshape(self):
for i in range(1, 6):
for j in range(1, 6):
vi = Variable("i", 1, 5).bind(i)
vj = Variable("j", 1, 5).bind(j)
t = Tensor.rand(i, j).reshape(vi, vj)
assert t.shape == (vi, vj)
# NOTE: this is currently not allowed
# t = t.reshape(1, vi*vj)
# assert t.shape == (1, vi*vj)
t = t.reshape(vj, vi)
assert t.shape == (vj, vi)
class TestSymbolicExpand(unittest.TestCase):
def test_expand_into_symbols(self):
    # TODO: enforce expand only into bound variables
vi = Variable("i", 1, 5)
vj = Variable("j", 1, 5)
a = Tensor([[1], [2], [3]]).expand((3, vi))
assert a.shape == (3, vi)
a = a.reshape(3, vi, 1).expand((3, vi, vj))
assert a.shape == (3, vi, vj)
def test_plus_expands_constant(self):
for i in range(1, 6):
vi = Variable("i", 1, 5).bind(i)
a = Tensor.rand(3, i).reshape(3, vi)
a = a + 1
assert a.shape == (3, vi)
class TestSymbolicShrink(unittest.TestCase):
def test_shrink_symbols(self):
vi = Variable("i", 1, 5)
t = Tensor.rand(3, 5).shrink(((0, 2), (vi, vi+1)))
assert t.shape == (2, 1)
class TestSymbolicShapeExpr(unittest.TestCase):
def test_symbolic_expr_idxs(self):
# taken from symbolic shape llama
i = Variable("i", 1, 120)
gidx0 = Variable("gidx0", 0, i)
lidx1 = Variable("lidx1", 0, 7)
idx = (gidx0, lidx1, Variable.num(1))
shape = (i+1, 8, 4)
strides = (1, (i*4)+4, i+1)
st = ShapeTracker((View.create(shape, strides), ))
idx, valid = st.expr_idxs(idx)
assert idx.render() == "((lidx1*((i*4)+4))+1+gidx0+i)"
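    # sanity check of the render: the strides dotted with the idxs give
    # gidx0*1 + lidx1*((i*4)+4) + 1*(i+1), which prints as the asserted string.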
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,266 @@
import numpy as np
import torch
import struct
import unittest, copy
import mmap
from tinygrad.tensor import Tensor, Device
from tinygrad.helpers import dtypes
from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from extra.utils import temp
x_init = np.random.randn(1,3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)
class TestTinygrad(unittest.TestCase):
def test_zerodim_initialization(self):
a = Tensor(55)
b = Tensor(3.14)
self.assertEqual(a.shape, ())
self.assertEqual(b.shape, ())
def test_plus_equals(self):
a = Tensor.randn(10,10)
b = Tensor.randn(10,10)
c = a + b
val1 = c.numpy()
a += b
val2 = a.numpy()
np.testing.assert_allclose(val1, val2)
def test_backward_pass(self):
def test_tinygrad():
x = Tensor(x_init, requires_grad=True)
W = Tensor(W_init, requires_grad=True)
m = Tensor(m_init)
out = x.dot(W).relu()
out = out.log_softmax()
out = out.mul(m).add(m).sum()
out.backward()
return out.numpy(), x.grad.numpy(), W.grad.numpy()
def test_pytorch():
x = torch.tensor(x_init, requires_grad=True)
W = torch.tensor(W_init, requires_grad=True)
m = torch.tensor(m_init)
out = x.matmul(W).relu()
out = torch.nn.functional.log_softmax(out, dim=1)
out = out.mul(m).add(m).sum()
out.backward()
return out.detach().numpy(), x.grad, W.grad
for x,y in zip(test_tinygrad(), test_pytorch()):
np.testing.assert_allclose(x, y, atol=1e-5)
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "this test uses more than 8 bufs which breaks webgpu") #TODO: remove after #1461
def test_backward_pass_diamond_model(self):
def test_tinygrad():
u = Tensor(U_init, requires_grad=True)
v = Tensor(V_init, requires_grad=True)
w = Tensor(W_init, requires_grad=True)
x = u.mul(v).relu()
y = u.mul(w).relu()
out = x.add(y).mul(y).relu()
out = out.log_softmax()
out = out.sum()
out.backward()
return out.numpy(), u.grad.numpy(), v.grad.numpy(), w.grad.numpy()
def test_pytorch():
u = torch.tensor(U_init, requires_grad=True)
v = torch.tensor(V_init, requires_grad=True)
w = torch.tensor(W_init, requires_grad=True)
x = u.mul(v).relu()
y = u.mul(w).relu()
out = x.add(y).mul(y).relu()
out = torch.nn.functional.log_softmax(out, dim=1)
out = out.sum()
out.backward()
return out.detach().numpy(), u.grad, v.grad, w.grad
for x,y in zip(test_tinygrad(), test_pytorch()):
np.testing.assert_allclose(x, y, atol=1e-5)
def test_nograd(self):
x = Tensor(x_init, requires_grad=False)
m = Tensor(m_init, requires_grad=False)
W = Tensor(W_init, requires_grad=True)
tmp = x.mul(m)
mm = tmp.matmul(W)
out = mm.relu()
out = out.sum()
out.backward()
assert x.grad is None
assert m.grad is None
assert tmp.grad is None
assert mm.grad is not None
assert W.grad is not None
def test_dropout(self):
with Tensor.train():
n, rate = 1_000_000, 0.1
w = Tensor.ones(n).dropout(rate)
non_zeros = np.count_nonzero(w.numpy())
expected = n * (1 - rate)
np.testing.assert_allclose(non_zeros, expected, rtol=2e-3)
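      # rough arithmetic behind the tolerance: survivors ~ Binomial(n, 0.9),
      # mean 900_000 with sigma sqrt(n*0.1*0.9) ~ 300, so rtol=2e-3 (~1800)
      # leaves roughly six sigma of headroom.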
def test_jacobian(self):
W = np.random.RandomState(42069).random((10, 5)).astype(np.float32)
x = np.random.RandomState(69420).random((1, 10)).astype(np.float32)
torch_x = torch.tensor(x, requires_grad=True)
torch_W = torch.tensor(W, requires_grad=True)
torch_func = lambda x: torch.nn.functional.log_softmax(x.matmul(torch_W).relu(), dim=1)
PJ = torch.autograd.functional.jacobian(torch_func, torch_x).squeeze().numpy()
tiny_x = Tensor(x, requires_grad=True)
tiny_W = Tensor(W, requires_grad=True)
tiny_func = lambda x: x.dot(tiny_W).relu().log_softmax()
J = jacobian(tiny_func, tiny_x)
NJ = numerical_jacobian(tiny_func, tiny_x)
np.testing.assert_allclose(PJ, J, atol = 1e-5)
np.testing.assert_allclose(PJ, NJ, atol = 1e-3)
def test_gradcheck(self):
W = np.random.RandomState(1337).random((10, 5)).astype(np.float32)
x = np.random.RandomState(7331).random((1, 10)).astype(np.float32)
tiny_x = Tensor(x, requires_grad=True)
tiny_W = Tensor(W, requires_grad=True)
tiny_func = lambda x: x.dot(tiny_W).relu().log_softmax()
self.assertTrue(gradcheck(tiny_func, tiny_x, eps = 1e-3))
    # the numerical jacobian is only a coarse approximation: the "big" eps above passes, while the much smaller one falls below float32 finite-difference precision and is expected to fail
self.assertFalse(gradcheck(tiny_func, tiny_x, eps = 1e-5))
def test_random_fns_are_deterministic_with_seed(self):
for random_fn in [Tensor.randn, Tensor.normal, Tensor.uniform, Tensor.scaled_uniform, Tensor.glorot_uniform, Tensor.kaiming_normal]:
with self.subTest(msg=f"Tensor.{random_fn.__name__}"):
Tensor.manual_seed(1337)
a = random_fn(10,10).realize()
Tensor.manual_seed(1337)
b = random_fn(10,10).realize()
np.testing.assert_allclose(a.numpy(), b.numpy())
def test_randn_isnt_inf_on_zero(self):
# simulate failure case of rand handing a zero to randn
original_rand, Tensor.rand = Tensor.rand, Tensor.zeros
    try: self.assertNotIn(np.inf, Tensor.randn(16).numpy())
    finally: Tensor.rand = original_rand
def test_zeros_like_has_same_dtype(self):
for datatype in [dtypes.float16, dtypes.float32, dtypes.int8, dtypes.int32, dtypes.int64, dtypes.uint8]:
a = Tensor([1, 2, 3], dtype=datatype)
b = Tensor.zeros_like(a)
assert a.dtype == b.dtype, f"a.dtype and b.dtype should be {datatype}"
      assert a.shape == b.shape, f"shape mismatch: input {a.shape} != Tensor.zeros_like output {b.shape}"
a = Tensor([1, 2, 3])
b = Tensor.zeros_like(a, dtype=dtypes.int8)
assert a.dtype != b.dtype and a.dtype == dtypes.float32 and b.dtype == dtypes.int8, "a.dtype should be float and b.dtype should be char"
    assert a.shape == b.shape, f"shape mismatch: input {a.shape} != Tensor.zeros_like output {b.shape}"
def test_ones_like_has_same_dtype_and_shape(self):
for datatype in [dtypes.float16, dtypes.float32, dtypes.int8, dtypes.int32, dtypes.int64, dtypes.uint8]:
a = Tensor([1, 2, 3], dtype=datatype)
b = Tensor.ones_like(a)
assert a.dtype == b.dtype, f"a.dtype and b.dtype should be {datatype}"
      assert a.shape == b.shape, f"shape mismatch: input {a.shape} != Tensor.ones_like output {b.shape}"
a = Tensor([1, 2, 3])
b = Tensor.ones_like(a, dtype=dtypes.int8)
assert a.dtype != b.dtype and a.dtype == dtypes.float32 and b.dtype == dtypes.int8, "a.dtype should be float and b.dtype should be char"
    assert a.shape == b.shape, f"shape mismatch: input {a.shape} != Tensor.ones_like output {b.shape}"
def test_ndim(self):
assert Tensor.randn(1).ndim == 1
assert Tensor.randn(2,2,2).ndim == 3
assert Tensor.randn(1,1,1,1,1,1).ndim == 6
def test_argfix(self):
self.assertEqual(Tensor.zeros().shape, ())
self.assertEqual(Tensor.ones().shape, ())
self.assertEqual(Tensor.zeros([]).shape, ())
self.assertEqual(Tensor.ones([]).shape, ())
self.assertEqual(Tensor.zeros(tuple()).shape, ())
self.assertEqual(Tensor.ones(tuple()).shape, ())
self.assertEqual(Tensor.zeros(1).shape, (1,))
self.assertEqual(Tensor.ones(1).shape, (1,))
self.assertEqual(Tensor.zeros(1,10,20).shape, (1,10,20))
self.assertEqual(Tensor.ones(1,10,20).shape, (1,10,20))
self.assertEqual(Tensor.zeros([1]).shape, (1,))
self.assertEqual(Tensor.ones([1]).shape, (1,))
self.assertEqual(Tensor.zeros([10,20,40]).shape, (10,20,40))
self.assertEqual(Tensor.ones([10,20,40]).shape, (10,20,40))
def test_numel(self):
assert Tensor.randn(10, 10).numel() == 100
assert Tensor.randn(1,2,5).numel() == 10
assert Tensor.randn(1,1,1,1,1,1).numel() == 1
assert Tensor([]).numel() == 0
    # assert Tensor.randn(1,0,2,5).numel() == 0 # TODO: fix empty tensors
def test_element_size(self):
for _, dtype in dtypes.fields().items():
assert dtype.itemsize == Tensor.randn(3, dtype=dtype).element_size(), f"Tensor.element_size() not matching Tensor.dtype.itemsize for {dtype}"
def test_deepwalk_ctx_check(self):
layer = Tensor.uniform(1, 1, requires_grad=True)
x = Tensor.randn(1, 1, 1)
x.dot(layer).mean().backward()
x = Tensor.randn(1, 1, 1)
x.dot(layer).mean().backward()
def test_zerosized_tensors(self):
Tensor([]).realize()
Tensor([]).numpy()
def test_tensor_ndarray_dtype(self):
arr = np.array([1]) # where dtype is implicitly int64
assert Tensor(arr).dtype == dtypes.int64
assert Tensor(arr, dtype=dtypes.float32).dtype == dtypes.float32 # check if ndarray correctly casts to Tensor dtype
assert Tensor(arr, dtype=dtypes.float64).dtype == dtypes.float64 # check that it works for something else
def test_tensor_list_dtype(self):
arr = [1]
assert Tensor(arr).dtype == Tensor.default_type
assert Tensor(arr, dtype=dtypes.float32).dtype == dtypes.float32
assert Tensor(arr, dtype=dtypes.float64).dtype == dtypes.float64
def test_tensor_copy(self):
x = copy.deepcopy(Tensor.ones((3,3,3)))
np.testing.assert_allclose(x.numpy(), np.ones((3,3,3)))
def test_copy_from_disk(self):
t = Tensor.randn(30, device="CPU").to(f"disk:{temp('test_copy_from_disk')}")
a = t[10:20]
dev = a.to(Device.DEFAULT)
np.testing.assert_allclose(a.numpy(), dev.numpy())
# Regression test for https://github.com/tinygrad/tinygrad/issues/1751
def test_copy_from_numpy_unaligned(self):
# 2**15 is the minimum for repro
arr = np.random.randn(2**15).astype(dtypes.float.np)
fn = temp('test_copy_from_numpy_unaligned')
with open(fn, 'wb') as f: f.write(b't' + arr.tobytes())
with open(fn, "a+b") as f: memview = memoryview(mmap.mmap(f.fileno(), arr.nbytes + 1))
ua_arr = np.frombuffer(memview[1:], dtype=arr.dtype, count=arr.shape[0])
np.testing.assert_allclose(arr, ua_arr)
assert not ua_arr.flags.aligned
    # force a device copy: .to() would be optimized away, and dividing by a python scalar 1 would be folded, hence the explicit Tensor(1)
np.testing.assert_allclose(ua_arr, (Tensor(ua_arr)/Tensor(1)).numpy())
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,99 @@
from typing import Optional, Tuple, Any, List
import unittest, math
import numpy as np
from tinygrad.helpers import dtypes, getenv, DType, PtrDType
from tinygrad.tensor import Device
from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, ASTRunner, Compiled
from tinygrad.codegen.linearizer import UOps, UOp
def _uops_to_prg(uops):
src, runtime_args = Device[Device.DEFAULT].renderer("test", uops)
return ASTRunner("test", src,
[1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
runtime_args=runtime_args).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
uops.append(UOp(uop, dtype, tuple(vin), arg, len(uops)))
return uops[-1]
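# a sketch of the one-element programs the helpers below assemble, in order:
#   DEFINE_GLOBAL data0 (output) and data1..dataN (inputs)
#   LOAD each input at CONST index 0 (or CONST immediates in the _const variant)
#   ALU applies `op` to the loaded values
#   STORE writes the result to data0[0]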
def _test_single_value(vals, op, dtype):
uops = []
buf_store = uop(uops, UOps.DEFINE_GLOBAL, PtrDType(dtype), (), ('data0', dtype))
buf_loads = [uop(uops, UOps.DEFINE_GLOBAL, PtrDType(dtype), (), (f'data{i+1}', dtype)) for i in range(len(vals))]
loads = (uop(uops, UOps.LOAD, dtype, [buf_loads[i], uop(uops, UOps.CONST, dtypes.int32, (), 0)]) for i in range(len(vals)))
alu = uop(uops, UOps.ALU, dtype, loads, op)
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Device[Device.DEFAULT].buffer(1, dtype)
buf2 = [Device[Device.DEFAULT].buffer.fromCPU(np.array([a], dtype=dtype.np)) for a in vals]
prg = _uops_to_prg(uops)
prg([buf]+buf2)
return buf.toCPU()[0]
def _test_single_value_const(vals, op, dtype):
uops = []
buf_store = uop(uops, UOps.DEFINE_GLOBAL, PtrDType(dtype), (), ('data0', dtype))
loads = (uop(uops, UOps.CONST, dtype, [], a) for a in vals)
alu = uop(uops, UOps.ALU, dtype, loads, op)
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Device[Device.DEFAULT].buffer(1, dtype)
prg = _uops_to_prg(uops)
prg([buf])
return buf.toCPU()[0]
class TestUOps(unittest.TestCase):
def _equal(self, v1, v2):
if not (math.isnan(v1) and math.isnan(v2)): self.assertAlmostEqual(v1, v2, places=5)
def _test_uop_fxn(self, bop, fxn, dt=dtypes.float32):
for f in [_test_single_value, _test_single_value_const]:
for a in [-2.0, 0.0, 1.0]:
self._equal(f([a], bop, dt), fxn(a))
def _test_bop_fxn(self, bop, fxn, dt=dtypes.float32, no_b_zero=False):
for f in [_test_single_value, _test_single_value_const]:
for a in [-2.0, 0.0, 1.0]:
for b in [-3.0, 1.0] + ([] if no_b_zero else [0.0]):
self._equal(f([a,b], bop, dt), fxn(a,b))
def _test_top_fxn(self, bop, fxn, dt=dtypes.float32):
for f in [_test_single_value, _test_single_value_const]:
for a in [-2.0, 0, 1]:
for b in [-3.0, 3.0]:
for c in [-4.0, 4.0]:
self._equal(f([a,b,c], bop, dt), fxn(a,b,c))
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
class TestFloatUOps(TestUOps):
def test_neg(self): self._test_uop_fxn(UnaryOps.NEG, lambda a: -a)
def test_exp2(self): self._test_uop_fxn(UnaryOps.EXP2, lambda a: np.exp2(a))
def test_log2(self): self._test_uop_fxn(UnaryOps.LOG2, lambda a: math.log2(a) if a > 0 else float('-inf' if a==0 else 'nan'))
def test_sin(self): self._test_uop_fxn(UnaryOps.SIN, lambda a: math.sin(a))
def test_sqrt(self): self._test_uop_fxn(UnaryOps.SQRT, lambda a: math.sqrt(a) if a >= 0 else float('nan'))
# this is not on most backends
#def test_recip(self): self._test_uop_fxn(UnaryOps.RECIP, lambda a: 1.0/a if a != 0 else float('inf'))
def test_add(self): self._test_bop_fxn(BinaryOps.ADD, lambda a,b: a+b)
def test_sub(self): self._test_bop_fxn(BinaryOps.SUB, lambda a,b: a-b)
def test_mul(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: a*b)
def test_div(self): self._test_bop_fxn(BinaryOps.DIV, lambda a,b: a/b if b != 0 else a*float('inf'))
def test_max(self): self._test_bop_fxn(BinaryOps.MAX, lambda a,b: max(a,b))
def test_cmplt(self): self._test_bop_fxn(BinaryOps.CMPLT, lambda a,b: float(a<b))
# MOD isn't tested on floats
def test_mulacc(self): self._test_top_fxn(TernaryOps.MULACC, lambda a,b,c: (a*b)+c)
def test_where(self): self._test_top_fxn(TernaryOps.WHERE, lambda a,b,c: b if a!=0 else c)
# TODO: fix this on all the backends
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled) or getenv('ARM64', False), "only test for compiled backends, broken on some")
class TestNonFloatUOps(TestUOps):
def test_neg_int32(self): self._test_uop_fxn(UnaryOps.NEG, lambda a: -a, dtypes.int32)
def test_add_int32(self): self._test_bop_fxn(BinaryOps.ADD, lambda a,b: int(a)+int(b), dtypes.int32)
def test_sub_int32(self): self._test_bop_fxn(BinaryOps.SUB, lambda a,b: int(a)-int(b), dtypes.int32)
def test_mul_int32(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: int(a)*int(b), dtypes.int32)
def test_div_int32(self): self._test_bop_fxn(BinaryOps.DIV, lambda a,b: int(a/b), dtypes.int32, no_b_zero=True)
def test_mod_int32(self): self._test_bop_fxn(BinaryOps.MOD, lambda a,b: abs(int(a))%abs(int(b))*(1,-1)[a<0], dtypes.int32, no_b_zero=True)
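  # the MOD reference above is truncated (C-style) mod: the result takes the
  # sign of the dividend, e.g. a=-7, b=3 gives -1 here versus Python's floored
  # -7 % 3 == 2.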
def test_cmplt_int32(self): self._test_bop_fxn(BinaryOps.CMPLT, lambda a,b: float(a<b), dtypes.int32)
def test_mul_bool(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: bool(a) and bool(b), dtypes.bool)
if __name__ == '__main__':
unittest.main(verbosity=2)

View File

@@ -0,0 +1,51 @@
const puppeteer = require('puppeteer');
const { spawn } = require('child_process');
const res = spawn("python", ["-m", "http.server", "8000"], { shell: true });
async function timeout(time) {
return new Promise((resolve) => setTimeout(resolve, time));
}
function cleanup(err) {
res.kill();
if(err != null) {
console.error(err);
process.exit(1);
}
}
async function waitForText(selector, text) {
let n = 0;
let ready = false;
while (n < 10) {
const res = await (await selector.getProperty("textContent")).jsonValue();
console.log(`waiting for text ${text} got ${res}`);
if(res == text) {
ready = true;
break
}
await timeout(2000);
n += 1
}
return ready;
}
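// polls the selector's textContent every 2 seconds, up to 10 tries (~20s), so
// slow WebGPU model compilation doesn't flake the test on the first check.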
puppeteer.launch({ headless: false, args: ["--enable-unsafe-webgpu"]}).then(async browser => {
const page = await browser.newPage();
page.on("console", message => console.log(`message from console ${message.text()}`))
.on("pageerror", ({ message }) => console.log(`error from page ${message}`))
const res = await page.goto("http://localhost:8000/examples/index.html");
if(res.status() != 200) throw new Error("Failed to load page");
const textSelector = await page.waitForSelector("#result");
const buttonSelector = await page.waitForSelector("input[type=button]");
const ready = await waitForText(textSelector, "ready");
if(!ready) throw new Error("Failed to load page");
await buttonSelector.evaluate(e => e.click());
const done = await waitForText(textSelector, "hen");
if(!done) throw new Error("failed to get hen");
browser.close();
cleanup(null);
}).catch(err => {
cleanup(err);
});

View File

@@ -0,0 +1,40 @@
import unittest
from tinygrad.helpers import Timing, CI
from tinygrad.tensor import Tensor
from tinygrad.ops import LoadOps
from tinygrad.codegen.linearizer import Linearizer
from test.test_net_speed import start_profile, stop_profile
class TestWinograd(unittest.TestCase):
def setUp(self):
self.old = Tensor.wino
Tensor.wino = 1
def tearDown(self): Tensor.wino = self.old
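  # Tensor.wino toggles the Winograd conv path, which trades fewer multiplies
  # for a much larger expression graph - hence these tests time scheduling and
  # linearization rather than checking numerics.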
def test_speed(self):
x = Tensor.empty(1,4,9,9)
w = Tensor.empty(4,4,3,3)
with Timing("running conv: "):
out = Tensor.conv2d(x, w)
with Timing("scheduling: "):
sched = out.lazydata.schedule()
for i,s in enumerate(sched):
if s.ast.op in LoadOps: continue
ops = s.ast.get_lazyops()
with Timing(f"linearize {i} with {len(ops):4d} ops: "):
l = Linearizer(s.ast)
l.hand_coded_optimizations()
l.linearize()
def test_profile(self):
x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
if not CI: pr = start_profile()
out = Tensor.conv2d(x,w).realize()
if not CI: stop_profile(pr, sort='time')
out.numpy()
if __name__ == '__main__':
unittest.main(verbosity=2)

View File

@@ -0,0 +1,66 @@
import unittest
import pickle
from tinygrad.helpers import diskcache_get, diskcache_put
def remote_get(table,q,k): q.put(diskcache_get(table, k))
def remote_put(table,k,v): diskcache_put(table, k, v)
class DiskCache(unittest.TestCase):
def test_putget(self):
table = "test_putget"
diskcache_put(table, "hello", "world")
self.assertEqual(diskcache_get(table, "hello"), "world")
diskcache_put(table, "hello", "world2")
self.assertEqual(diskcache_get(table, "hello"), "world2")
def test_putcomplex(self):
table = "test_putcomplex"
diskcache_put(table, "k", ("complex", 123, "object"))
ret = diskcache_get(table, "k")
self.assertEqual(ret, ("complex", 123, "object"))
def test_getotherprocess(self):
table = "test_getotherprocess"
from multiprocessing import Process, Queue
diskcache_put(table, "k", "getme")
q = Queue()
p = Process(target=remote_get, args=(table,q,"k"))
p.start()
p.join()
self.assertEqual(q.get(), "getme")
def test_putotherprocess(self):
table = "test_putotherprocess"
from multiprocessing import Process
p = Process(target=remote_put, args=(table,"k", "remote"))
p.start()
p.join()
self.assertEqual(diskcache_get(table, "k"), "remote")
def test_no_table(self):
self.assertIsNone(diskcache_get("faketable", "k"))
def test_ret(self):
table = "test_ret"
self.assertEqual(diskcache_put(table, "key", ("vvs",)), ("vvs",))
def test_non_str_key(self):
table = "test_non_str_key"
diskcache_put(table, 4, 5)
self.assertEqual(diskcache_get(table, 4), 5)
self.assertEqual(diskcache_get(table, "4"), 5)
def test_dict_key(self):
table = "test_dict_key"
fancy_key = {"hello": "world", "goodbye": 7, "good": True, "pkl": pickle.dumps("cat")}
fancy_key2 = {"hello": "world", "goodbye": 8, "good": True, "pkl": pickle.dumps("cat")}
fancy_key3 = {"hello": "world", "goodbye": 8, "good": True, "pkl": pickle.dumps("dog")}
diskcache_put(table, fancy_key, 5)
self.assertEqual(diskcache_get(table, fancy_key), 5)
diskcache_put(table, fancy_key2, 8)
self.assertEqual(diskcache_get(table, fancy_key2), 8)
self.assertEqual(diskcache_get(table, fancy_key), 5)
self.assertEqual(diskcache_get(table, fancy_key3), None)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,150 @@
import pathlib
import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
from tinygrad.helpers import dtypes
from tinygrad.runtime.ops_disk import RawDiskBuffer
from tinygrad.helpers import Timing
from extra.utils import fetch_as_file, temp
def compare_weights_both(url):
import torch
fn = fetch_as_file(url)
tg_weights = get_state_dict(torch_load(fn))
torch_weights = get_state_dict(torch.load(fn), tensor_type=torch.Tensor)
assert list(tg_weights.keys()) == list(torch_weights.keys())
for k in tg_weights:
np.testing.assert_equal(tg_weights[k].numpy(), torch_weights[k].numpy(), err_msg=f"mismatch at {k}, {tg_weights[k].shape}")
print(f"compared {len(tg_weights)} weights")
class TestTorchLoad(unittest.TestCase):
# pytorch pkl format
def test_load_enet(self): compare_weights_both("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth")
# pytorch zip format
def test_load_enet_alt(self): compare_weights_both("https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth")
# pytorch zip format
def test_load_convnext(self): compare_weights_both('https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth')
# TODO: support pytorch tar format with minimal lines
#def test_load_resnet(self): compare_weights_both('https://download.pytorch.org/models/resnet50-19c8e357.pth')
test_fn = pathlib.Path(__file__).parents[2] / "weights/LLaMA/7B/consolidated.00.pth"
#test_size = test_fn.stat().st_size
test_size = 1024*1024*1024*2
# sudo su -c 'sync; echo 1 > /proc/sys/vm/drop_caches' && python3 test/unit/test_disk_tensor.py TestRawDiskBuffer.test_readinto_read_speed
@unittest.skipIf(not test_fn.exists(), "download LLaMA weights for read in speed tests")
class TestRawDiskBuffer(unittest.TestCase):
def test_readinto_read_speed(self):
tst = np.empty(test_size, np.uint8)
with open(test_fn, "rb") as f:
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
f.readinto(tst)
def test_mmap_read_speed(self):
db = RawDiskBuffer(test_size, dtype=dtypes.uint8, device=test_fn)
tst = np.empty(test_size, np.uint8)
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
np.copyto(tst, db.toCPU())
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu doesn't support uint8 datatype")
class TestSafetensors(unittest.TestCase):
def test_real_safetensors(self):
import torch
from safetensors.torch import save_file
torch.manual_seed(1337)
tensors = {
"weight1": torch.randn((16, 16)),
"weight2": torch.arange(0, 17, dtype=torch.uint8),
"weight3": torch.arange(0, 17, dtype=torch.int32).reshape(17,1,1),
"weight4": torch.arange(0, 2, dtype=torch.uint8),
}
save_file(tensors, temp("model.safetensors"))
ret = safe_load(temp("model.safetensors"))
for k,v in tensors.items(): np.testing.assert_array_equal(ret[k].numpy(), v.numpy())
safe_save(ret, temp("model.safetensors_alt"))
with open(temp("model.safetensors"), "rb") as f:
with open(temp("model.safetensors_alt"), "rb") as g:
assert f.read() == g.read()
ret2 = safe_load(temp("model.safetensors_alt"))
for k,v in tensors.items(): np.testing.assert_array_equal(ret2[k].numpy(), v.numpy())
def test_efficientnet_safetensors(self):
from models.efficientnet import EfficientNet
model = EfficientNet(0)
state_dict = get_state_dict(model)
safe_save(state_dict, temp("eff0"))
state_dict_loaded = safe_load(temp("eff0"))
assert sorted(list(state_dict_loaded.keys())) == sorted(list(state_dict.keys()))
for k,v in state_dict.items():
np.testing.assert_array_equal(v.numpy(), state_dict_loaded[k].numpy())
# load with the real safetensors
from safetensors import safe_open
with safe_open(temp("eff0"), framework="pt", device="cpu") as f:
assert sorted(list(f.keys())) == sorted(list(state_dict.keys()))
for k in f.keys():
np.testing.assert_array_equal(f.get_tensor(k).numpy(), state_dict[k].numpy())
def test_huggingface_enet_safetensors(self):
# test a real file
fn = fetch_as_file("https://huggingface.co/timm/mobilenetv3_small_075.lamb_in1k/resolve/main/model.safetensors")
state_dict = safe_load(fn)
assert len(state_dict.keys()) == 244
assert 'blocks.2.2.se.conv_reduce.weight' in state_dict
assert state_dict['blocks.0.0.bn1.num_batches_tracked'].numpy() == 276570
assert state_dict['blocks.2.0.bn2.num_batches_tracked'].numpy() == 276570
def test_metadata(self):
metadata = {"hello": "world"}
safe_save({}, temp('metadata.safetensors'), metadata)
import struct
with open(temp('metadata.safetensors'), 'rb') as f:
dat = f.read()
sz = struct.unpack(">Q", dat[0:8])[0]
import json
assert json.loads(dat[8:8+sz])['__metadata__']['hello'] == 'world'
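    # safetensors layout: an 8-byte little-endian header length, the JSON
    # header (user metadata lives under "__metadata__"), then raw tensor
    # bytes - none here since the saved dict is empty.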
def helper_test_disk_tensor(fn, data, np_fxn, tinygrad_fxn=None):
if tinygrad_fxn is None: tinygrad_fxn = np_fxn
pathlib.Path(temp(fn)).unlink(missing_ok=True)
tinygrad_tensor = Tensor(data, device="CPU").to(f"disk:{temp(fn)}")
numpy_arr = np.array(data)
tinygrad_fxn(tinygrad_tensor)
np_fxn(numpy_arr)
np.testing.assert_allclose(tinygrad_tensor.numpy(), numpy_arr)
class TestDiskTensor(unittest.TestCase):
def test_empty(self):
pathlib.Path(temp("dt1")).unlink(missing_ok=True)
Tensor.empty(100, 100, device=f"disk:{temp('dt1')}")
def test_write_ones(self):
pathlib.Path(temp("dt2")).unlink(missing_ok=True)
out = Tensor.ones(10, 10, device="CPU")
outdisk = out.to(f"disk:{temp('dt2')}")
print(outdisk)
outdisk.realize()
del out, outdisk
# test file
with open(temp("dt2"), "rb") as f:
assert f.read() == b"\x00\x00\x80\x3F" * 100
# test load alt
reloaded = Tensor.empty(10, 10, device=f"disk:{temp('dt2')}")
out = reloaded.numpy()
assert np.all(out == 1.)
def test_assign_slice(self):
def assign(x,s,y): x[s] = y
helper_test_disk_tensor("dt3", [0,1,2,3], lambda x: assign(x, slice(0,2), [13, 12]))
helper_test_disk_tensor("dt4", [[0,1,2,3],[4,5,6,7]], lambda x: assign(x, slice(0,1), [[13, 12, 11, 10]]))
def test_reshape(self):
helper_test_disk_tensor("dt5", [1,2,3,4,5], lambda x: x.reshape((1,5)))
helper_test_disk_tensor("dt6", [1,2,3,4], lambda x: x.reshape((2,2)))
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python
import unittest
from tinygrad.ops import LazyOp, BinaryOps, ReduceOps, get_lazyop_info, BufferOps, MemBuffer
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.helpers import dtypes
class TestFlopCounter(unittest.TestCase):
def setUp(self):
self.buf0 = LazyOp(BufferOps.MEM, (), MemBuffer(1, dtypes.float32, ShapeTracker.from_shape((4,))))
self.buf1 = LazyOp(BufferOps.MEM, (), MemBuffer(2, dtypes.float32, ShapeTracker.from_shape((4,))))
def test_flops_add(self):
op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
info = get_lazyop_info(op0)
self.assertEqual(info.flops, 4)
def test_flops_add_twice(self):
op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
op1 = LazyOp(BinaryOps.ADD, (op0,self.buf1,), None)
info = get_lazyop_info(op1)
self.assertEqual(info.flops, 8)
def test_flops_add_self(self):
op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
op1 = LazyOp(BinaryOps.ADD, (op0,op0,), None)
info = get_lazyop_info(op1)
self.assertEqual(info.flops, 8)
def test_flops_add_roundabout_self(self):
op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
op1 = LazyOp(BinaryOps.ADD, (op0,self.buf1,), None)
op2 = LazyOp(BinaryOps.ADD, (op0,op1,), None)
info = get_lazyop_info(op2)
self.assertEqual(info.flops, 12)
def test_flops_red(self):
op0 = LazyOp(BinaryOps.MUL, (self.buf0,self.buf1,), None)
op1 = LazyOp(ReduceOps.SUM, (op0,), (1,))
op2 = LazyOp(BinaryOps.ADD, (op1, op1,), None)
info = get_lazyop_info(op2)
self.assertEqual(info.flops, 9)
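    # arithmetic: 4 (elementwise MUL) + 4 (SUM reduction over 4 elements)
    # + 1 (ADD on the reduced scalar) = 9; the shared op1 is counted once.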
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,142 @@
import unittest
import numpy as np
from tinygrad.helpers import Context, ContextVar, DType, dtypes, merge_dicts, strip_parens, prod
from tinygrad.shape.symbolic import Variable, NumNode
VARIABLE = ContextVar("VARIABLE", 0)
class TestContextVars(unittest.TestCase):
# Ensuring that the test does not modify variables outside the tests.
ctx = Context()
def setUp(self): TestContextVars.ctx.__enter__()
def tearDown(self): TestContextVars.ctx.__exit__()
def test_initial_value_is_set(self):
_TMP = ContextVar("_TMP", 5)
self.assertEqual(_TMP.value, 5)
def test_multiple_creation_ignored(self):
_TMP2 = ContextVar("_TMP2", 1)
_TMP2 = ContextVar("_TMP2", 2)
self.assertEqual(_TMP2.value, 1)
def test_new_var_inside_context(self):
# Creating a _new_ variable inside a context should not have any effect on its scope (?)
with Context(VARIABLE=1):
_TMP3 = ContextVar("_TMP3", 1)
_TMP3 = ContextVar("_TMP3", 2)
self.assertEqual(_TMP3.value, 1)
  def test_value_across_modules(self):
# Mocking module import by invoking the code but not in our globals().
exec('from tinygrad.helpers import ContextVar;C = ContextVar("C", 13)', {}) # pylint:disable=exec-used
# It should not matter that the first creation was in another module.
C = ContextVar("C", 0)
self.assertEqual(C.value, 13)
def test_assignment_across_modules(self):
B = ContextVar("B", 1)
# local assignment
B.value = 2
self.assertEqual(B.value, 2)
# Assignment in another module.
exec('from tinygrad.helpers import ContextVar;B = ContextVar("B", 0);B.value = 3;', {}) # pylint:disable=exec-used
# Assignment in another module should affect this one as well.
self.assertEqual(B.value, 3)
def test_context_assignment(self):
with Context(VARIABLE=1):
self.assertEqual(VARIABLE.value, 1)
self.assertEqual(VARIABLE.value, 0)
def test_unknown_param_to_context(self):
with self.assertRaises(KeyError):
with Context(SOMETHING_ELSE=1):
pass
def test_inside_context_assignment(self):
with Context(VARIABLE=4):
# What you can and cannot do inside a context.
      # 1. Re-creating an existing ContextVar has no effect.
      VARIABLE = ContextVar("VARIABLE", 0)
      self.assertTrue(VARIABLE >= 4, "re-creating a ContextVar inside a contextmanager does not set a new value")
      # 2. Assigning to .value, however, does take effect locally.
      VARIABLE.value = 13
      self.assertTrue(VARIABLE.value == 13, "assigning to .value works inside a contextmanager")
# Related to 2. above. Note that VARIABLE is back to 0 again as expected.
self.assertEqual(VARIABLE.value, 0)
def test_new_var_inside_context_other_module(self):
with Context(VARIABLE=1):
_NEW2 = ContextVar("_NEW2", 0)
_NEW2 = ContextVar("_NEW2", 1)
self.assertEqual(_NEW2.value, 0)
code = """\
from tinygrad.helpers import Context, ContextVar
with Context(VARIABLE=1):
_NEW3 = ContextVar("_NEW3", 0)"""
exec(code, {}) # pylint:disable=exec-used
# While _NEW3 was created in an outside scope it should still work the same as above.
_NEW3 = ContextVar("_NEW3", 1)
self.assertEqual(_NEW3.value, 0)
def test_nested_context(self):
with Context(VARIABLE=1):
with Context(VARIABLE=2):
with Context(VARIABLE=3):
self.assertEqual(VARIABLE.value, 3)
self.assertEqual(VARIABLE.value, 2)
self.assertEqual(VARIABLE.value, 1)
self.assertEqual(VARIABLE.value, 0)
def test_decorator(self):
@Context(VARIABLE=1, DEBUG=4)
def test():
self.assertEqual(VARIABLE.value, 1)
self.assertEqual(VARIABLE.value, 0)
test()
self.assertEqual(VARIABLE.value, 0)
def test_context_exit_reverts_updated_values(self):
D = ContextVar("D", 1)
D.value = 2
with Context(D=3):
...
assert D.value == 2, f"Expected D to be 2, but was {D.value}. Indicates that Context.__exit__ did not restore to the correct value."
class TestMergeDicts(unittest.TestCase):
def test_merge_dicts(self):
a = {"a": 1, "b": 2}
b = {"a": 1, "c": 3}
c = {}
d = {"a": 2, "b": 2}
assert merge_dicts([a, b]) == {"a": 1, "b": 2, "c": 3}
assert merge_dicts([a, c]) == a
assert merge_dicts([a, b, c]) == {"a": 1, "b": 2, "c": 3}
with self.assertRaises(AssertionError):
merge_dicts([a, d])
class TestDtypes(unittest.TestCase):
def test_dtypes_fields(self):
fields = dtypes.fields()
self.assertTrue(all(isinstance(value, DType) for value in fields.values()))
self.assertTrue(all(issubclass(value.np, np.generic) for value in fields.values() if value.np is not None))
class TestStripParens(unittest.TestCase):
def test_simple(self): self.assertEqual("1+2", strip_parens("(1+2)"))
def test_nested(self): self.assertEqual("1+(2+3)", strip_parens("(1+(2+3))"))
def test_casted_no_strip(self): self.assertEqual("(int)(1+2)", strip_parens("(int)(1+2)"))
class TestProd(unittest.TestCase):
def test_empty(self): self.assertEqual(1, prod(tuple()))
def test_ints(self): self.assertEqual(30, prod((2, 3, 5)))
def test_variable(self): self.assertEqual("(a*12)", prod((Variable("a", 1, 5), 3, 4)).render())
def test_variable_order(self): self.assertEqual("(a*12)", prod((3, 4, Variable("a", 1, 5))).render())
def test_num_nodes(self): self.assertEqual(NumNode(6), prod((NumNode(2), NumNode(3))))
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,663 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.helpers import prod, DEBUG
from tinygrad.shape.shapetracker import ShapeTracker, View, get_contraction
from tinygrad.shape.symbolic import Variable
from itertools import product
def shapetracker_getitem(st, val):
locals = {"idx": val, "valid": 1}
idx, valid = st.expr_node()
exec(f"valid={valid.render()};idx={idx.render()}", None, locals)
return locals["idx"] if locals["valid"] else -1
class CheckingShapeTracker:
def __init__(self, shape):
self.st = ShapeTracker.from_shape(shape)
self.t = np.arange(prod(shape), dtype=np.int32).reshape(shape)
@property
def shape(self):
return self.t.shape
def simplify(self):
self.st = self.st.simplify()
return self
def reshape(self, new_shape):
self.st = self.st.reshape(new_shape)
self.t = self.t.reshape(new_shape)
return self
def permute(self, axis):
self.st = self.st.permute(axis)
self.t = np.transpose(self.t, axis)
return self
def expand(self, new_shape):
self.st = self.st.expand(new_shape)
self.t = np.broadcast_to(self.t, new_shape)
return self
def flip(self, axis):
self.st = self.st.stride(tuple(-1 if i in axis else 1 for i in range(len(self.shape))))
self.t = np.flip(self.t, axis)
return self
def shrink(self, arg):
self.st = self.st.shrink(arg)
self.t = self.t[tuple([slice(x[0], x[1]) for x in arg])]
return self
def pad(self, arg):
self.st = self.st.pad(arg)
self.t = np.pad(self.t, arg, constant_values=-1)
return self
def stride(self, arg):
self.st = self.st.stride(arg)
self.t = self.t[tuple([slice(None, None, x) for x in arg])]
return self
def __getitem__(self, val):
return self.t.flatten()[val]
@property
def views(self): return self.st.views
@property
def contiguous(self): return self.st.contiguous
def assert_same(self):
x = [shapetracker_getitem(self.st, i) for i in range(prod(self.st.shape))]
y = [self[i] for i in range(prod(self.shape))]
idx, valid = self.st.expr_node()
if DEBUG >= 1: print(x, y, self.st.shape, self.shape, idx.render(), valid.render(), self.st)
assert self.st.shape == self.shape
assert x == y, f"mismatch shapetracker:{x} real:{y}"
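  # the harness applies each movement op to both the ShapeTracker and a real
  # numpy array; assert_same then walks every flat index through both, so any
  # stride/offset/mask bug surfaces as a mismatch between the two address maps.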
class TestRealIssues(unittest.TestCase):
def test_reshape_doesnt_multiview(self):
self.st = ShapeTracker((View.create((256, 256, 2, 2, 2, 2, 2, 256, 8, 2), (0, 8, 0, 4, 0, 0, 2, 16384, 2048, 1), 0, None),))
self.st.reshape((128, 2, 256, 2, 2, 2, 2, 2, 256, 8, 2))
assert len(self.st.views) == 1
class TestRealDoesntSimplify(unittest.TestCase):
def tearDown(self):
st = self.st.real_strides()
print(st)
self.st = self.st.simplify()
assert len(self.st.views) != 1
assert None in st
def test_1(self):
self.st = ShapeTracker((
View.create((8, 3, 1, 2, 11, 1), (33, 11, 0, 0, 1, 0), 0, None),
View.create((8, 6, 11), (66, 11, 1), 0, None)))
assert self.st.real_strides() == (33, None, 1)
def test_2(self):
self.st = ShapeTracker((
View.create((2, 2, 4, 3, 3), (72, 9, 18, -3, -1), 8, None),
View.create((4, 4, 3, 3), (36, 9, 3, 1), 0, None)))
assert self.st.real_strides() == (None, 18, -3, -1)
class TestRealStrides(unittest.TestCase):
def test_1(self):
self.st = ShapeTracker((
View.create((2048,), (1,), 0, ((0, 512),)),
View.create((16, 32, 4), (128, 4, 1), 0, None)))
st = self.st.real_strides()
print(self.st, st)
assert st == (None, 4, 1)
class TestRealSimplifies(unittest.TestCase):
def tearDown(self):
st = self.st.real_strides()
self.st = self.st.simplify()
assert len(self.st.views) == 1
print(self.st.views[-1].strides, st)
assert self.st.views[-1].strides == st
def test_1(self):
self.st = ShapeTracker((
View.create((1, 3, 2, 11, 4, 28), (0, 308, 0, 28, 0, 1), 0, None),
View.create((1, 3, 2, 11, 26, 1, 1, 3), (0, 2464, 0, 112, 1, 0, 0, 29), 0, None)))
def test_2(self):
self.st = ShapeTracker((
View.create((8, 3, 3, 11, 2, 28), (924, 308, 0, 28, 0, 1), 0, None),
View.create((8, 1, 6, 10, 28, 3, 2, 1), (5544, 0, 0, 56, 1, 1848, 672, 0), 0, None)))
class TestIndexExpressions2d(unittest.TestCase):
def setUp(self):
shapes = [(30, 5), (15, 10), (15, 1), (5, 10), (5, 1)] # Make sure dim0 is a multiple of 5, one of the tests divides this dimension by 5
offsets = [0, 1, 15, 28, 10000]
self.sts = [ShapeTracker((View.create(base_shape, offset=offset),)) for base_shape in shapes for offset in offsets]
self.offset = [Variable.num(offset) for base_shape in shapes for offset in offsets]
self.shapes = [shape for shape in shapes for offset in offsets]
self.node_exprs = []
self.idxs_exprs = []
def tearDown(self):
for st, offset, shape, node_expr, idxs_expr in zip(self.sts, self.offset, self.shapes, self.node_exprs, self.idxs_exprs):
numel = prod(shape)
assert node_expr(self.default_idx(st.shape)) == st.expr_node()[0]
assert node_expr(self.default_idx(st.shape)) == st.expr_node(None)[0]
assert node_expr(self.default_idx(st.shape)) == st.expr_node('idx')[0]
self.check_bounds(node_expr(self.default_idx(st.shape)), offset, numel)
for idx in [(0, numel-1), (7, 203), (2, 5), (0, 0), (numel, numel), (0, numel), (0, numel+1), (numel+100, numel+100)]:
idx = Variable("idx", idx[0], idx[1])
assert node_expr(idx) == st.expr_node(idx)[0]
self.check_bounds(node_expr(idx), offset, numel)
assert idxs_expr(self.default_idxs(st.shape)) == st.expr_idxs()[0]
assert idxs_expr(self.default_idxs(st.shape)) == st.expr_idxs(None)[0]
self.check_bounds(idxs_expr(self.default_idxs(st.shape)), offset, numel)
idx0s = [(0,0), (0, min(1, st.shape[0]-1)), (0, st.shape[0]-1), (min(3, st.shape[0]-1), min(6, st.shape[0]-1)), (st.shape[0]-1, st.shape[0]-1)]
idx1s = [(0,0), (0, min(1, st.shape[1]-1)), (0, st.shape[1]-1), (min(3, st.shape[1]-1), min(6, st.shape[1]-1)), (st.shape[1]-1, st.shape[1]-1)]
idx2s = [(0,0), (0, min(1, st.shape[2]-1)), (0, st.shape[2]-1), (min(3, st.shape[2]-1), min(6, st.shape[2]-1)), (st.shape[2]-1, st.shape[2]-1)] if len(st.shape) == 3 else [None for _ in idx0s]
for idx0, idx1, idx2 in product(idx0s, idx1s, idx2s):
idxs = [Variable(f"idx{i}", idx[0], idx[1]) for i, idx in enumerate((idx0, idx1, idx2)) if idx is not None]
assert idxs_expr(idxs) == st.expr_idxs(idxs)[0]
self.check_bounds(idxs_expr(idxs), offset, numel)
def default_idx(self, shape):
return Variable("idx", 0, prod(shape)-1)
def default_idxs(self, shape):
return [Variable(f"idx{i}", 0, d-1) for i,d in enumerate(shape)]
def check_bounds(self, expr, offset, numel):
assert expr.min >= offset
assert expr.max <= offset + numel - 1
def test_noop(self):
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape) + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[1] + offset)
def test_permute(self):
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.permute((1, 0))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%base_shape[0]*base_shape[1] + idx//base_shape[0]%base_shape[1] + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0] + idxs[1]*base_shape[1] + offset)
new_st.append(st)
self.sts = new_st
def test_reshape(self):
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.reshape((base_shape[0], 1, base_shape[1]))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape) + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[2] + offset)
new_st.append(st)
self.sts = new_st
def test_reshape_expand(self):
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.reshape((base_shape[0], 1, base_shape[1]))
st = st.expand((base_shape[0], base_shape[1], base_shape[1]))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx//(base_shape[1]*base_shape[1])%base_shape[0]*base_shape[1] + idx%base_shape[1] + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[2] + offset)
new_st.append(st)
self.sts = new_st
def test_permute_reshape_1(self): # This tests multiple views
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.permute((1, 0))
st = st.reshape((base_shape[0]//5, 1, base_shape[1]*5))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape)%base_shape[0]*base_shape[1] + idx//base_shape[0]%base_shape[1] + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: (idxs[0]*(base_shape[1]*5)+idxs[2])%base_shape[0]*base_shape[1] + (idxs[0]*(base_shape[1]*5)+idxs[2])//base_shape[0] + offset)
new_st.append(st)
self.sts = new_st
def test_permute_reshape_2(self):
new_st = []
for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
st = st.permute((1, 0))
st = st.reshape((1, base_shape[0]//5, base_shape[1]*5))
self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape)%base_shape[0]*base_shape[1] + idx//base_shape[0]%base_shape[1] + offset)
self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: (idxs[1]*(base_shape[1]*5)+idxs[2])%base_shape[0]*base_shape[1] + (idxs[1]*(base_shape[1]*5)+idxs[2])//base_shape[0] + offset)
new_st.append(st)
self.sts = new_st
class TestSimplifyingShapeTracker(unittest.TestCase):
def setUp(self):
self.st = CheckingShapeTracker((1, 10))
def tearDown(self):
self.st.assert_same()
# multiview simplify
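# a rough sketch of what these tests exercise: expand followed by an
# incompatible reshape forces the ShapeTracker to stack a second view;
# simplify() merges the views back into one whenever the composed index
# mapping is expressible by a single strided view.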
def test_expand_contract_simple(self):
self.st = self.st.expand((10, 10))
self.st = self.st.reshape((100,))
print(self.st.views)
assert len(self.st.views) == 2
self.st = self.st.reshape((10, 10))
print(self.st.views)
self.st = self.st.simplify()
print(self.st.views)
assert len(self.st.views) == 1
# multiview simplify
def test_expand_contract_different_shape(self):
self.st.expand((10, 10))
self.st.reshape((100,))
print(self.st.views)
assert len(self.st.views) == 2
self.st.reshape((2, 5, 2, 5))
print(self.st.views)
self.st = self.st.simplify()
print(self.st.views)
assert len(self.st.views) == 1
# multiview simplify
def test_expand_contract_still_complex(self):
self.st.expand((10, 10))
self.st.reshape((100,))
print(self.st.views)
assert len(self.st.views) == 2
self.st.reshape((5, 20))
self.st = self.st.simplify()
print(self.st.views)
assert len(self.st.views) == 2
# Tensor.zeros(2, 4).permute(1,0).reshape(2, 4)
# (d1*4 + d0%4), d1=x//4, d0=x%4 = ((x//4)*4) + (x%4)%4
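# a hedged concrete reading of the above: with arange(8) viewed as (2, 4),
# memory holds [0..7]; permute(1,0).reshape(2, 4) yields the flat output
# [0, 4, 1, 5, 2, 6, 3, 7], i.e. output index x reads memory (x%2)*4 + x//2,
# which is not affine in x, so no single strided view can express it.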
class TestComplexShapeTracker(unittest.TestCase):
def test_add_1s(self):
self.st = CheckingShapeTracker((4, 4))
self.st.permute((1,0))
self.st.reshape((1,4,1,4,1))
assert not self.st.contiguous
self.st.permute((0,3,2,1,4))
assert self.st.contiguous
def test_permute_1s_simple(self):
self.st = CheckingShapeTracker((1, 16, 9,9))
self.st.permute((1,0,2,3))
assert self.st.contiguous
self.st = CheckingShapeTracker((2, 16, 9,9))
self.st.permute((1,0,2,3))
assert not self.st.contiguous
def test_remove_1s_simple(self):
self.st = CheckingShapeTracker((1, 16, 1, 1))
self.st.reshape((16,))
assert self.st.contiguous
def test_remove_1s(self):
self.st = CheckingShapeTracker((1, 4, 1, 4, 1))
self.st.permute((0,3,2,1,4))
self.st.reshape((4,4))
assert not self.st.contiguous
self.st.permute((1,0))
assert self.st.contiguous
def test_permute_reshape(self):
self.st = CheckingShapeTracker((4, 4))
self.st.permute((1,0))
self.st.reshape((2, 2, 2, 2))
# TODO: should also be tested by test_super_complex
assert len(self.st.views) == 1
def test_factorize_split(self):
self.st = CheckingShapeTracker((4, 4))
self.st.permute((1,0))
self.st.reshape((2, 2, 2, 2))
self.st.permute((2,3,0,1))
assert self.st.contiguous
def test_factorize_combine(self):
self.st = CheckingShapeTracker((4, 4, 4))
self.st.permute((2, 0, 1))
self.st.reshape((4, 16))
self.st.permute((1, 0))
assert self.st.contiguous
def test_factorize_combine_add_ones(self):
self.st = CheckingShapeTracker((4, 4, 4))
self.st.permute((2, 0, 1))
self.st.reshape((4, 16, 1, 1))
self.st.permute((1, 0, 2, 3))
assert self.st.contiguous
def test_fancy_factorize(self):
self.st = CheckingShapeTracker((32, 3, 3, 1))
self.st.reshape((8, 4, 3, 3))
assert len(self.st.views) == 1
def test_super_complex_2_fail(self):
self.st = CheckingShapeTracker((4, 4, 4))
self.st.permute((2, 0, 1))
self.st.reshape((16, 4))
assert len(self.st.views) != 1
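# why this needs two views: after permute the strides are (1, 16, 4), and
# flattening the first two axes walks memory as 0, 16, 32, 48, 1, 17, ...
# which is not an affine function of the merged index.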
def test_work(self):
self.st = CheckingShapeTracker((64, 1024, 4))
self.st.reshape((1, 64, 128, 32))
self.st.permute((0, 3, 1, 2))
self.st.reshape((1, 32, 1, 64, 128))
self.st.permute((0, 3, 4, 1, 2))
assert self.st.contiguous
def test_work2(self):
self.st = CheckingShapeTracker((64, 1024, 4))
self.st.reshape((1, 64, 128, 32))
self.st.permute((0, 3, 1, 2))
self.st.reshape((1, 1, 32, 64, 128))
self.st.permute((0, 3, 4, 1, 2))
self.st.reshape((64, 1024, 4))
print(self.st.views)
assert self.st.contiguous
class TestSingleShapeTracker(unittest.TestCase):
def setUp(self):
self.st = CheckingShapeTracker((7,4))
def tearDown(self):
self.st.assert_same()
def test_reshape(self):
self.st.reshape((7,1,4))
assert self.st.contiguous
def test_permute(self):
self.st.permute((1,0))
assert not self.st.contiguous
def test_shrink(self):
self.st.shrink(((1,2), (0,4)))
assert not self.st.contiguous
def test_double_permute(self):
self.st.permute((1,0))
self.st.permute((1,0))
assert self.st.contiguous
def test_reshape_permute(self):
self.st.reshape((7,1,4))
self.st.permute((0,1,2))
assert self.st.contiguous
def test_reshape_permute_yes(self):
self.st.reshape((7,1,4))
self.st.permute((0,2,1))
assert self.st.contiguous
def test_reshape_permute_no(self):
self.st.reshape((4,7))
self.st.permute((1,0))
assert not self.st.contiguous
class TestShapeTrackerFuzzFailures(unittest.TestCase):
def setUp(self):
self.st = CheckingShapeTracker((3,3,3))
def tearDown(self):
self.st.assert_same()
@unittest.skip("simplify doesn't work in this case")
def test_case_1(self):
self.st.shrink(((1, 2), (1, 3), (1, 3)))
self.st.reshape((1, 4))
self.st.shrink(((0, 1), (1, 3)))
print(self.st.st)
self.st = self.st.simplify()
print(self.st.st)
def test_case_2(self):
self.st.stride((1, 1, -2))
self.st.reshape((3, 6))
self.st.shrink(((1, 2), (1, 5)))
self.st.stride((1, -1))
def test_case_3(self):
self.st.shrink(((0, 2), (0, 2), (0, 1)))
self.st.permute((1, 0, 2))
self.st.reshape((4,))
self.st.shrink(((0, 3),))
self.st.stride((-1,))
def test_case_4(self):
self.st.reshape((3, 3, 3, 1))
self.st.pad(((0, 0), (0, 0), (0, 0), (1, 1)))
self.st.shrink(((0, 2), (1, 2), (0, 2), (0, 1)))
self.st.expand((2, 1, 2, 3))
class TestMaskedShapeTracker(unittest.TestCase):
def test_pad_1x1(self):
self.st = CheckingShapeTracker((1,1))
self.st.pad(((1,1), (1,1)))
self.st.assert_same()
def test_pad_2x2(self):
self.st = CheckingShapeTracker((2,2))
self.st.pad(((1,1), (1,1)))
self.st.assert_same()
class TestShapeTracker(unittest.TestCase):
def setUp(self):
self.st = CheckingShapeTracker((7,4))
self.apply = lambda fxn: [fxn(x) for x in [self.st]]
def tearDown(self):
self.st.assert_same()
def test_noop(self):
pass
def test_simple_split(self):
self.test_permute()
self.apply(lambda x: x.reshape((prod(self.st.shape), )))
def test_simple_pad(self):
self.st.pad(((1,1), (1,1)))
def test_pad_shrink(self):
self.st.pad(((1,1), (1,1)))
self.st.shrink(((0,4), (0,4)))
def test_pad_one_sided(self):
self.st.pad(((0,1), (0,0)))
def test_pad_reshape(self):
self.st.pad(((0,1), (0,0)))
self.st.reshape((8*4,))
def test_pad_pad(self):
self.st.pad(((1,1), (1,1)))
self.st.pad(((1,1), (1,1)))
def test_pad_permute(self):
self.st.pad(((1,1), (2,2)))
self.st.permute((1,0))
def test_pad_expand(self):
self.st.reshape((7,4,1))
self.st.pad(((1,1), (1,1), (0,0)))
self.st.expand((9,6,4))
def test_pad_expand_alt(self):
self.st.pad(((1,1), (1,1)))
self.st.reshape((9,6,1))
self.st.expand((9,6,4))
def test_pad_stride(self):
self.st.pad(((1,4), (1,3)))
self.st.stride((2,2))
def test_pad_stride_neg(self):
self.st.pad(((1,2), (1,0)))
self.st.stride((-1,-1))
def test_pad_stride_both(self):
self.st.pad(((1,2), (1,0)))
self.st.stride((-2,-2))
def test_shrink_pad(self):
self.st.shrink(((0,4), (0,4)))
self.st.pad(((1,1), (1,1)))
def test_reshape(self):
new_shape = self.st.shape[::-1]
self.apply(lambda x: x.reshape(new_shape))
def test_permute(self):
if len(self.st.shape) == 2: self.apply(lambda x: x.permute((1,0)))
elif len(self.st.shape) == 3: self.apply(lambda x: x.permute((2,0,1)))
def test_reshape_with_1(self):
new_shape = (self.st.shape[0], 1, self.st.shape[1])
self.apply(lambda x: x.reshape(new_shape))
def test_expand(self):
self.test_reshape_with_1()
new_shape = list(self.st.shape)
new_shape[1] = 2
self.apply(lambda x: x.expand(tuple(new_shape)))
def test_flip_0(self):
self.apply(lambda x: x.flip((0,)))
def test_flip_1(self):
self.apply(lambda x: x.flip((1,)))
def test_flip_01(self):
self.apply(lambda x: x.flip((0,1)))
def test_slice_0(self):
self.apply(lambda x: x.shrink(((1, x.shape[0]), (0, x.shape[1]))))
def test_slice_1(self):
self.apply(lambda x: x.shrink(((0, x.shape[0]), (1, x.shape[1]))))
def test_slice_1c1(self):
self.apply(lambda x: x.shrink(((0, 1), (0, 1))))
def test_slice_1c2(self):
self.apply(lambda x: x.shrink(((1, 2), (1, 2))))
def test_double_permute(self):
self.apply(lambda x: x.permute((1, 0)))
self.apply(lambda x: x.permute((1, 0)))
def test_slice_permute(self):
self.apply(lambda x: x.shrink(((0, 2), (2, 4))))
self.apply(lambda x: x.permute((1, 0)))
def test_slice_expand(self):
self.apply(lambda x: x.shrink(((0, 2), (3, 4))))
self.apply(lambda x: x.expand((2, 10)))
def test_double_stride(self):
self.apply(lambda x: x.stride((1, 2)))
self.apply(lambda x: x.stride((2, 1)))
def test_stride(self): self.apply(lambda x: x.stride((2,1)))
def test_stride_int(self): self.apply(lambda x: x.stride((1,2)))
def test_stride_2(self): self.apply(lambda x: x.stride((2,2)))
def test_stride_n(self): self.apply(lambda x: x.stride((-2,1)))
def test_stride_int_n(self): self.apply(lambda x: x.stride((-1,2)))
def test_stride_2_n(self): self.apply(lambda x: x.stride((-2,-2)))
def test_reshape_then_permute(self):
self.test_reshape()
self.test_permute()
def test_reshape_then_expand(self):
self.test_reshape()
self.test_expand()
def test_permute_then_reshape(self):
self.test_permute()
self.test_reshape()
def test_expand_then_reshape(self):
self.test_expand()
self.test_reshape()
def test_combo(self):
self.test_permute()
self.test_reshape()
self.test_slice_1()
self.test_expand()
self.test_permute()
class TestGetContraction(unittest.TestCase):
def test_contraction(self):
r = get_contraction((1,2,3,4), (2,3,4))
self.assertEqual(r, [[0, 1], [2], [3]])
r = get_contraction((2,1,3,4), (2,3,4))
self.assertEqual(r, [[0], [1, 2], [3]])
r = get_contraction((1,2,3,1,4), (1,2,3,4))
self.assertEqual(r, [[0], [1], [2], [3, 4]])
r = get_contraction((1,2,3,1,4,1,1), (2,3,4))
self.assertEqual(r, [[0, 1], [2], [3, 4, 5, 6]])
r = get_contraction((1,2,3,4), (1,2,3*4))
self.assertEqual(r, [[0], [1], [2, 3]])
r = get_contraction((1,2,3,4), (2,1,3,4))
self.assertEqual(r, [[0, 1], [], [2], [3]])
r = get_contraction((1,2,3,4), (1,1,2*3*4,1))
self.assertEqual(r, [[0], [], [1,2,3], []])
r = get_contraction((2,1,3,4), (1,2,3,4))
self.assertEqual(r, [[], [0], [1, 2], [3]])
r = get_contraction((1,2,3,4), (2*3*4,1,1,1))
self.assertEqual(r, [[0, 1, 2, 3], [], [], []])
r = get_contraction((4,4,4,4), (16,1,16))
self.assertEqual(r, [[0, 1], [], [2, 3]])
r = get_contraction((1,2,3,4,1,1,1), (2,3,4))
self.assertEqual(r, [[0, 1], [2], [3, 4, 5, 6]])
r = get_contraction((1,2,3,4), (1,2,3,4,1))
self.assertEqual(r, [[0], [1], [2], [3], []])
r = get_contraction((14,1,384,14,1,1,1,1), (1,14,384,14))
self.assertEqual(r, [[], [0], [1,2], [3,4,5,6,7]])
r = get_contraction((14,1,384,1,14,1,1,1,1), (1,14,384,14))
self.assertEqual(r, [[], [0], [1,2], [3,4,5,6,7,8]])
r = get_contraction((512, 512), (1, 1, 512, 1, 1, 1, 1, 512))
self.assertEqual(r, [[], [], [0], [], [], [], [], [1]])
r = get_contraction((1,2,3,4), (1,2,6,2))
self.assertEqual(r, None)
def test_contraction_ones(self):
r = get_contraction((1,), (1,1,1))
self.assertEqual(r, [[0], [], []])
r = get_contraction((1,1), (1,1,1))
self.assertEqual(r, [[0], [1], []])
r = get_contraction((1,1,1,1), (1,))
self.assertEqual(r, [[0,1,2,3]])
r = get_contraction((1,1,1,1), (1,1))
self.assertEqual(r, [[0], [1,2,3]])
r = get_contraction((1,1,1,1), (1,1,1))
self.assertEqual(r, [[0], [1], [2,3]])
r = get_contraction((1,1,1,1), (1,1,1,1))
self.assertEqual(r, [[0], [1], [2], [3]])
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,39 @@
import unittest
import multiprocessing.shared_memory as shared_memory
from tinygrad.helpers import CI
from tinygrad.runtime.ops_shm import RawShmBuffer
from tinygrad.tensor import Tensor, Device
import numpy as np
class TestRawShmBuffer(unittest.TestCase):
def test_e2e(self):
t = Tensor.randn(2, 2, 2).realize()
# copy to shm
shm_name = (s := shared_memory.SharedMemory(create=True, size=t.nbytes())).name
s.close()
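# note: close() only releases this process's handle; the named segment
# persists until unlink() at the end, so the "shm:" device can attach by name.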
t_shm = t.to(f"shm:{shm_name}").realize()
# copy from shm
t2 = t_shm.to(Device.DEFAULT).realize()
assert np.allclose(t.numpy(), t2.numpy())
s.unlink()
@unittest.skipIf(CI, "CI doesn't like big shared memory")
def test_e2e_big(self):
t = Tensor.randn(2048, 2048, 8).realize()
# copy to shm
shm_name = (s := shared_memory.SharedMemory(create=True, size=t.nbytes())).name
s.close()
t_shm = t.to(f"shm:{shm_name}").realize()
# copy from shm
t2 = t_shm.to(Device.DEFAULT).realize()
assert np.allclose(t.numpy(), t2.numpy())
s.unlink()
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,448 @@
#!/usr/bin/env python
import unittest
from tinygrad.shape.symbolic import Node, MulNode, SumNode, Variable, NumNode, LtNode, sym_render, sym_infer, create_rednode
class TestSymbolic(unittest.TestCase):
def helper_test_variable(self, v, n, m, s):
self.assertEqual(v.render(), s)
self.assertEqual(v.min, n)
self.assertEqual(v.max, m)
def test_ge(self):
self.helper_test_variable(Variable("a", 3, 8)>=77, 0, 0, "0")
self.helper_test_variable(Variable("a", 3, 8)>=9, 0, 0, "0")
self.helper_test_variable(Variable("a", 3, 8)>=8, 0, 1, "((a*-1)<-7)")
self.helper_test_variable(Variable("a", 3, 8)>=4, 0, 1, "((a*-1)<-3)")
self.helper_test_variable(Variable("a", 3, 8)>=3, 1, 1, "1")
self.helper_test_variable(Variable("a", 3, 8)>=2, 1, 1, "1")
def test_lt(self):
self.helper_test_variable(Variable("a", 3, 8)<77, 1, 1, "1")
self.helper_test_variable(Variable("a", 3, 8)<9, 1, 1, "1")
self.helper_test_variable(Variable("a", 3, 8)<8, 0, 1, "(a<8)")
self.helper_test_variable(Variable("a", 3, 8)<4, 0, 1, "(a<4)")
self.helper_test_variable(Variable("a", 3, 8)<3, 0, 0, "0")
self.helper_test_variable(Variable("a", 3, 8)<2, 0, 0, "0")
def test_ge_divides(self):
expr = (Variable("idx", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512
self.helper_test_variable(expr, 0, 1, "(idx<128)")
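# why this folds: FLOAT4_INDEX is at most 3, so idx*4 + FLOAT4_INDEX < 512
# holds exactly when idx*4 <= 508, i.e. idx < 128.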
def test_ge_divides_and(self):
expr = Variable.ands([(Variable("idx1", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512,
(Variable("idx2", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512])
self.helper_test_variable(expr, 0, 1, "((idx1<128) and (idx2<128))")
expr = Variable.ands([(Variable("idx1", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512,
(Variable("idx2", 0, 511)*4 + Variable("FLOAT8_INDEX", 0, 7)) < 512])
self.helper_test_variable(expr//4, 0, 1, "((((FLOAT8_INDEX//4)+idx2)<128) and ((idx1//4)<32))")
def test_lt_factors(self):
expr = Variable.ands([(Variable("idx1", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 256)) < 512])
self.helper_test_variable(expr, 0, 1, "(((idx1*4)+FLOAT4_INDEX)<512)")
def test_div_becomes_num(self):
assert isinstance(Variable("a", 2, 3)//2, NumNode)
def test_var_becomes_num(self):
assert isinstance(Variable("a", 2, 2), NumNode)
def test_equality(self):
idx1 = Variable("idx1", 0, 3)
idx2 = Variable("idx2", 0, 3)
assert idx1 == idx1
assert idx1 != idx2
assert idx1*4 == idx1*4
assert idx1*4 != idx1*3
assert idx1*4 != idx1+4
assert idx1*4 != idx2*4
assert idx1+idx2 == idx1+idx2
assert idx1+idx2 == idx2+idx1
assert idx1+idx2 != idx2
def test_factorize(self):
a = Variable("a", 0, 8)
self.helper_test_variable(a*2+a*3, 0, 8*5, "(a*5)")
def test_factorize_no_mul(self):
a = Variable("a", 0, 8)
self.helper_test_variable(a+a*3, 0, 8*4, "(a*4)")
def test_neg(self):
self.helper_test_variable(-Variable("a", 0, 8), -8, 0, "(a*-1)")
def test_add_1(self):
self.helper_test_variable(Variable("a", 0, 8)+1, 1, 9, "(1+a)")
def test_add_num_1(self):
self.helper_test_variable(Variable("a", 0, 8)+Variable.num(1), 1, 9, "(1+a)")
def test_sub_1(self):
self.helper_test_variable(Variable("a", 0, 8)-1, -1, 7, "(-1+a)")
def test_sub_num_1(self):
self.helper_test_variable(Variable("a", 0, 8)-Variable.num(1), -1, 7, "(-1+a)")
def test_mul_0(self):
self.helper_test_variable(Variable("a", 0, 8)*0, 0, 0, "0")
def test_mul_1(self):
self.helper_test_variable(Variable("a", 0, 8)*1, 0, 8, "a")
def test_mul_neg_1(self):
self.helper_test_variable((Variable("a", 0, 2)*-1)//3, -1, 0, "((((a*-1)+3)//3)+-1)")
def test_mul_2(self):
self.helper_test_variable(Variable("a", 0, 8)*2, 0, 16, "(a*2)")
def test_div_1(self):
self.helper_test_variable(Variable("a", 0, 8)//1, 0, 8, "a")
def test_mod_1(self):
self.helper_test_variable(Variable("a", 0, 8)%1, 0, 0, "0")
def test_add_min_max(self):
self.helper_test_variable(Variable("a", 0, 8) * 2 + 12, 12, 16+12, "((a*2)+12)")
def test_div_min_max(self):
self.helper_test_variable(Variable("a", 0, 7) // 2, 0, 3, "(a//2)")
def test_div_neg_min_max(self):
self.helper_test_variable(Variable("a", 0, 7) // -2, -3, 0, "((a//2)*-1)")
def test_sum_div_min_max(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7), Variable("b", 0, 3)]) // 2, 0, 5, "((a+b)//2)")
def test_sum_div_factor(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*4, Variable("b", 0, 3)*4]) // 2, 0, 20, "((a*2)+(b*2))")
def test_sum_div_some_factor(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*5, Variable("b", 0, 3)*4]) // 2, 0, 23, "(((a*5)//2)+(b*2))")
def test_sum_div_some_partial_factor(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*6, Variable("b", 0, 7)*6]) // 16, 0, 5, "(((a*3)+(b*3))//8)")
self.helper_test_variable(Variable.sum([Variable.num(16), Variable("a", 0, 7)*6, Variable("b", 0, 7)*6]) // 16, 1, 6, "((((a*3)+(b*3))//8)+1)")
def test_sum_div_no_factor(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*5, Variable("b", 0, 3)*5]) // 2, 0, 25, "(((a*5)+(b*5))//2)")
def test_mod_factor(self):
# NOTE: even though the mod max is 50, it can't know this without knowing about the mul
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*100, Variable("b", 0, 3)*50]) % 100, 0, 99, "((b*50)%100)")
def test_mod_to_sub(self):
# This is mod reduction
self.helper_test_variable((1+Variable("a",1,2))%2, 0, 1, (Variable("a",1,2)-1).render())
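# sanity check: a is 1 or 2, so (1+a) is 2 or 3 and (1+a)%2 is 0 or 1,
# which is exactly a-1.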
def test_sum_div_const(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*4, Variable.num(3)]) // 4, 0, 7, "a")
def test_sum_div_const_big(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*4, Variable.num(3)]) // 16, 0, 1, "(a//4)")
def test_sum_lt_fold(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7) * 4, Variable("b", 0, 3)]) < 16, 0, 1, "(a<4)")
self.helper_test_variable(Variable.sum([Variable("a", 0, 7) * 4, Variable("b", 0, 4)]) < 16, 0, 1, "(((a*4)+b)<16)")
def test_mod_mul(self):
self.helper_test_variable((Variable("a", 0, 5)*10)%9, 0, 5, "a")
def test_mod_mod(self):
self.helper_test_variable((Variable("a", 0, 31)%12)%4, 0, 3, "(a%4)")
self.helper_test_variable(((4*Variable("a", 0, 31)) % 12) % 4, 0, 0, "0")
self.helper_test_variable((Variable("a", 0, 31) % 4) % 12, 0, 3, "(a%4)")
def test_mul_mul(self):
self.helper_test_variable((Variable("a", 0, 5)*10)*9, 0, 5*10*9, "(a*90)")
def test_mul_lt(self):
self.helper_test_variable((Variable("a", 0, 5)*4)<13, 0, 1, "(a<4)")
self.helper_test_variable((Variable("a", 0, 5)*4)<16, 0, 1, "(a<4)")
self.helper_test_variable((Variable("a", 0, 5)*4)>11, 0, 1, "((a*-1)<-2)")
self.helper_test_variable((Variable("a", 0, 5)*4)>12, 0, 1, "((a*-1)<-3)")
def test_div_div(self):
self.helper_test_variable((Variable("a", 0, 1800)//10)//9, 0, 20, "(a//90)")
def test_distribute_mul(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 3), Variable("b", 0, 5)])*3, 0, 24, "((a*3)+(b*3))")
def test_mod_mul_sum(self):
self.helper_test_variable(Variable.sum([Variable("b", 0, 2), Variable("a", 0, 5)*10])%9, 0, 7, "(a+b)")
def test_sum_0(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 7)]), 0, 7, "a")
def test_mod_remove(self):
self.helper_test_variable(Variable("a", 0, 6)%100, 0, 6, "a")
def test_big_mod(self):
# NOTE: we no longer support negative variables
#self.helper_test_variable(Variable("a", -20, 20)%10, -9, 9, "(a%10)")
#self.helper_test_variable(Variable("a", -20, 0)%10, -9, 0, "(a%10)")
#self.helper_test_variable(Variable("a", -20, 1)%10, -9, 1, "(a%10)")
self.helper_test_variable(Variable("a", 0, 20)%10, 0, 9, "(a%10)")
#self.helper_test_variable(Variable("a", -1, 20)%10, -1, 9, "(a%10)")
def test_gt_remove(self):
self.helper_test_variable(Variable("a", 0, 6) >= 25, 0, 0, "0")
def test_lt_remove(self):
self.helper_test_variable(Variable("a", 0, 6) < -3, 0, 0, "0")
self.helper_test_variable(Variable("a", 0, 6) < 3, 0, 1, "(a<3)")
self.helper_test_variable(Variable("a", 0, 6) < 8, 1, 1, "1")
def test_lt_sum_remove(self):
self.helper_test_variable((Variable("a", 0, 6) + 2) < 3, 0, 1, "(a<1)")
def test_and_fold(self):
self.helper_test_variable(Variable.ands([Variable.num(0), Variable("a", 0, 1)]), 0, 0, "0")
def test_and_remove(self):
self.helper_test_variable(Variable.ands([Variable.num(1), Variable("a", 0, 1)]), 0, 1, "a")
def test_mod_factor_negative(self):
self.helper_test_variable(Variable.sum([Variable.num(-29), Variable("a", 0, 10), Variable("b", 0, 10)*28]) % 28, 0, 27, "((27+a)%28)")
self.helper_test_variable(Variable.sum([Variable.num(-29), Variable("a", 0, 100), Variable("b", 0, 10)*28]) % 28, 0, 27, "((27+a)%28)")
def test_sum_combine_num(self):
self.helper_test_variable(Variable.sum([Variable.num(29), Variable("a", 0, 10), Variable.num(-23)]), 6, 16, "(6+a)")
def test_sum_num_hoisted_and_factors_cancel_out(self):
self.helper_test_variable(Variable.sum([Variable("a", 0, 1) * -4 + 1, Variable("a", 0, 1) * 4]), 1, 1, "1")
def test_div_factor(self):
self.helper_test_variable(Variable.sum([Variable.num(-40), Variable("a", 0, 10)*2, Variable("b", 0, 10)*40]) // 40, -1, 9, "(-1+b)")
def test_mul_div(self):
self.helper_test_variable((Variable("a", 0, 10)*4)//4, 0, 10, "a")
def test_mul_div_factor_mul(self):
self.helper_test_variable((Variable("a", 0, 10)*8)//4, 0, 20, "(a*2)")
def test_mul_div_factor_div(self):
self.helper_test_variable((Variable("a", 0, 10)*4)//8, 0, 5, "(a//2)")
def test_div_remove(self):
self.helper_test_variable(Variable.sum([Variable("idx0", 0, 127)*4, Variable("idx2", 0, 3)])//4, 0, 127, "idx0")
def test_div_numerator_negative(self):
self.helper_test_variable((Variable("idx", 0, 9)*-10)//11, -9, 0, "((((idx*-10)+99)//11)+-9)")
def test_div_into_mod(self):
self.helper_test_variable((Variable("idx", 0, 16)*4)%8//4, 0, 1, "(idx%2)")
class TestSymbolicNumeric(unittest.TestCase):
def helper_test_numeric(self, f):
# TODO: why are the negative tests broken? (even if we did support negative variables)
#MIN, MAX = -10, 10
MIN, MAX = 0, 10
# one number
for i in range(MIN, MAX):
v = f(Variable.num(i))
#print(i, f(i), v.min, v.max)
self.assertEqual(v.min, v.max)
self.assertEqual(v.min, f(i))
for kmin in range(MIN, MAX):
for kmax in range(MIN, MAX):
if kmin > kmax: continue
v = f(Variable("tmp", kmin, kmax))
values = [f(rv) for rv in range(kmin, kmax+1)]
# the min and max may not be exact
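# e.g. for f(x) = (x*2 + 3) % 4 over [0, 1] the true values are {3, 1};
# a conservative symbolic bound like [0, 3] still passes, since only
# containment of the true range is checked below.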
self.assertLessEqual(v.min, min(values))
self.assertGreaterEqual(v.max, max(values))
def test_mod_4(self): self.helper_test_numeric(lambda x: (x%4))
def test_div_4(self): self.helper_test_numeric(lambda x: (x//4))
def test_plus_1_div_2(self): self.helper_test_numeric(lambda x: (x+1)//2)
def test_plus_1_mod_2(self): self.helper_test_numeric(lambda x: (x+1)%2)
def test_times_2(self): self.helper_test_numeric(lambda x: x*2)
def test_times_2_plus_3(self): self.helper_test_numeric(lambda x: x*2 + 3)
def test_times_2_plus_3_mod_4(self): self.helper_test_numeric(lambda x: (x*2 + 3)%4)
def test_times_2_plus_3_div_4(self): self.helper_test_numeric(lambda x: (x*2 + 3)//4)
def test_times_2_plus_3_div_4_mod_4(self): self.helper_test_numeric(lambda x: ((x*2 + 3)//4)%4)
class TestSymbolicVars(unittest.TestCase):
def test_simple(self):
z = NumNode(0)
a = Variable("a", 0, 10)
b = Variable("b", 0, 10)
c = Variable("c", 0, 10)
assert z.vars() == z.vars() == []
assert a.vars() == a.vars() == [a]
m = MulNode(a, 3)
assert m.vars() == [a]
s = SumNode([a, b, c])
assert s.vars() == [a, b, c]
def test_compound(self):
a = Variable("a", 0, 10)
b = Variable("b", 0, 10)
c = Variable("c", 0, 10)
assert (a + b * c).vars() == [a, b, c]
assert (a % 3 + b // 5).vars() == [a, b]
assert (a + b + c - a).vars() == [b, c]
class TestSymbolicMinMax(unittest.TestCase):
def test_min_max_known(self):
a = Variable("a", 1, 8)
assert max(1, a) == max(a, 1) == a
assert min(1, a) == min(a, 1) == 1
class TestSymRender(unittest.TestCase):
def test_sym_render(self):
a = Variable("a", 1, 8)
b = Variable("b", 1, 10)
assert sym_render(a) == "a"
assert sym_render(1) == "1"
assert sym_render(a+1) == "(1+a)"
assert sym_render(a*b) == "(a*b)"
class TestSymInfer(unittest.TestCase):
def test_sym_infer(self):
a = Variable("a", 0, 10)
b = Variable("b", 0, 10)
c = Variable("c", 0, 10)
var_vals = {a: 2, b: 3, c: 4}
assert sym_infer(5, var_vals) == 5
assert sym_infer(a, var_vals) == 2
assert sym_infer(b, var_vals) == 3
assert sym_infer(a+b, var_vals) == 5
assert sym_infer(a-b, var_vals) == -1
assert sym_infer(a+b+c, var_vals) == 9
assert sym_infer(a*b, var_vals) == 6
assert sym_infer(a*b+c, var_vals) == 10
class TestSymbolicSymbolicOps(unittest.TestCase):
def test_node_divmod_node(self):
i = Variable("i", 1, 10)
idx0 = Variable("idx0", 0, i*3-1)
assert NumNode(0) // (Variable("i", 1, 10)*128) == 0
assert NumNode(0) % (Variable("i", 1, 10)*128) == 0
assert NumNode(127) // (Variable("i", 1, 10)*128) == 0
assert NumNode(127) % (Variable("i", 1, 10)*128) == 127
assert 127 // (Variable("i", 1, 10)*128) == 0
assert 127 % (Variable("i", 1, 10)*128) == 127
assert NumNode(128) // (Variable("i", 1, 10)*128 + 128) == 0
assert NumNode(128) % (Variable("i", 1, 10)*128 + 128) == 128
assert 128 // (Variable("i", 1, 10)*128 + 128) == 0
assert 128 % (Variable("i", 1, 10)*128 + 128) == 128
assert 0 // (Variable("i", 1, 10)*128) == 0
assert 0 % (Variable("i", 1, 10)*128) == 0
assert idx0 // (i*3) == 0
assert idx0 % (i*3) == idx0
assert i // i == 1
assert i % i == 0
assert 128 // NumNode(4) == 32
assert 128 % NumNode(4) == 0
assert NumNode(128) // NumNode(4) == 32
assert NumNode(128) % NumNode(4) == 0
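# the constant folds above are pure range reasoning: with i >= 1 the
# divisor i*128 is at least 128, so any non-negative constant below it has
# quotient 0 and is its own remainder.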
def test_mulnode_divmod_node(self):
i = Variable("i", 1, 10)
idx0 = Variable("idx0", 0, 31)
assert (idx0*(i*4+4)) // (i+1) == (idx0*4)
assert (idx0*(i*4+4)) % (i+1) == 0
assert (idx0*i) % i == 0
def test_sumnode_divmod_sumnode(self):
i = Variable("i", 1, 10)
idx0 = Variable("idx0", 0, 7)
idx1 = Variable("idx1", 0, 3)
idx2 = Variable("idx2", 0, i)
assert (idx0*(i*4+4)+idx1*(i+1)+idx2) // (i+1) == idx0*4+idx1
assert (idx0*(i*4+4)+idx1*(i+1)+idx2) % (i+1) == idx2
assert (i+1) // (i*128+128) == 0
assert (i+1) % (i*128+128) == (i+1)
assert (i+1+idx2) // (i+1) == 1
assert (i+1+idx2) % (i+1) == idx2
assert (idx0*(i*4+4)+i+1+idx2) // (i+1) == idx0*4+1
assert (idx0*(i*4+4)+i+1+idx2) % (i+1) == idx2
assert (i*128+128)*2 // (i*128+128) == 2
assert (i*128+128)*2 % (i*128+128) == 0
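# term by term: idx0*(i*4+4) equals (i+1)*(idx0*4), so it moves wholly into
# the quotient, while idx2 in [0, i] is strictly below the divisor i+1 and
# stays in the remainder.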
def test_sumnode_divmod_sumnode_complex(self):
i = Variable("i", 1, 1024)
gidx0 = Variable("gidx0", 0, i)
lidx1 = Variable("lidx1", 0, 7)
ridx2 = Variable("ridx1", 0, 31)
assert ((i*128+128)*2 + gidx0*128 + lidx1*(i*512+512) + ridx2*4) // (i*128+128) == 2 + lidx1*4
assert ((i*128+128)*2 + gidx0*128 + lidx1*(i*512+512) + ridx2*4) % (i*128+128) == gidx0*128 + ridx2*4
assert ((gidx0*128+i*128+ridx2*4+129)) // (i*128+128) == 1
assert ((gidx0*128+i*128+ridx2*4+129)) % (i*128+128) == gidx0*128 + ridx2*4 + 1
assert (ridx2*(i*4+4)+1+i+gidx0) // (i*128+128) == 0
assert (ridx2*(i*4+4)+1+i+gidx0) % (i*128+128) == (ridx2*(i*4+4)+1+i+gidx0)
def test_node_lt_node(self):
a = Variable("a", 1, 5)
b = Variable("b", 6, 9)
c = Variable("c", 1, 10)
d = Variable("d", 5, 10)
# if the value is always the same, it folds to num
assert (a < b) == 1
assert (b < a) == 0
assert (d < a) == 0
# if it remains as a LtNode, bool is always true and (min, max) == (0, 1)
assert isinstance((a < c), LtNode) and (a < c).min == 0 and (a < c).max == 1
assert a < c
assert isinstance((a > c), LtNode) and (a > c).min == 0 and (a > c).max == 1
# same when comparing with a constant
assert a < 3 and (a < 3).min == 0 and (a < 3).max == 1
assert a > 3 and (a > 3).min == 0 and (a > 3).max == 1
def test_num_node_mul_node(self):
a = Variable("a", 1, 5)
b = NumNode(2) * a
assert b == a * 2
assert isinstance(b, MulNode)
b = NumNode(1) * a
assert b == a
assert isinstance(b, Variable)
b = NumNode(0) * a
assert b == 0
assert isinstance(b, NumNode)
def test_num_node_expand(self):
a = NumNode(42)
assert a.expand() == [a]
def test_variable_expand(self):
a = Variable("a", 5, 7)
assert a.expand() == [a]
def test_variable_expand_expr_none(self):
a = Variable(None, 5, 7)
assert a.expand() == [NumNode(5), NumNode(6), NumNode(7)]
def test_mul_node_expand(self):
a = Variable(None, 5, 7)
m = MulNode(a, 3)
assert m.expand() == [NumNode(15), NumNode(18), NumNode(21)]
b = Variable("b", 1, 3)
n = MulNode(b, 3)
assert n.expand() == [Variable("b", 1, 3)*3]
def test_sum_node_expand(self):
a = Variable(None, 1, 3)
b = Variable("b", 5, 7)
s1 = create_rednode(SumNode, [a, b])
assert s1.expand() == [Variable.sum([NumNode(i),b]) for i in range(1,4)]
def test_multi_expand(self):
a = Variable("a", 1, 3)
b = Variable("b", 14, 17)
s1 = create_rednode(SumNode, [a, b])
# expand increments earlier variables (in argument order) faster than later ones
# this ordering was carried over from the previous implementation; the rationale is not documented
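# e.g. with a in [1, 3] and b in [14, 17] the expansion enumerates
# (a=1,b=14), (a=2,b=14), (a=3,b=14), (a=1,b=15), ... matching the nested
# comprehension below (outer loop over b, inner over a).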
assert s1.expand((a, b)) == [NumNode(x + y) for x in range(b.min, b.max + 1) for y in range(a.min, a.max + 1)]
def test_substitute(self):
a = Variable(None, 1, 3)
b = a + 1
c = b.substitute({a: NumNode(1)})
assert c == NumNode(2)
if __name__ == '__main__':
unittest.main()